From d67924d234732e656362d7d8dc8db21a8e549091 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Fri, 29 May 2026 21:02:28 +0000
Subject: [PATCH 01/29] tool/build: Modernize pyproject.toml and add design
 manuals

* Introduce pyproject.toml defining package metadata and development dependencies.
* Add core design documentation (DESIGN.md) explaining Sequence primitives.

PiperPending-RevId: 923592270
PiperOrigin-RevId: 923592270
---
 DESIGN.md      | 166 +++++++++++++++++++++++++++++++++++++++++++++++++
 pyproject.toml |  52 +++++++++++++---
 2 files changed, 211 insertions(+), 7 deletions(-)
 create mode 100644 DESIGN.md

diff --git a/DESIGN.md b/DESIGN.md
new file mode 100644
index 0000000..f7dbb8e
--- /dev/null
+++ b/DESIGN.md
@@ -0,0 +1,166 @@
+# SequenceLayers Design & Philosophy
+
+This document summarizes the core design principles, primitives, and contracts
+of the `SequenceLayers` library, as detailed in `tech-report.pdf`. It is
+designed as a highly readable reference for both human developers and AI coding
+agents.
+
+--------------------------------------------------------------------------------
+
+## 1. Core Philosophy
+
+SequenceLayers is a design pattern and library for sequence modeling. It is
+built around three core features:
+
+1.  **Streamable:** Gives you streaming "for free". Every streamable layer
+    implements an explicit state and a `step` method to evolve that state,
+    allowing easy transition from offline training to online streaming.
+2.  **Correct by Default:** Eliminates entire classes of bugs related to
+    masking, padding, causality, and lookahead by enforcing a strict
+    mathematical contract and using unified `Sequence` containers.
+3.  **Composable:** Uses a declarative, compositional API (combinators like
+    `Serial`, `Residual`) that allows complex models to be defined like block
+    diagrams, automatically handling state plumbing and aggregate properties
+    (latency, receptive fields).
+
+--------------------------------------------------------------------------------
+
+## 2. Core Primitives
+
+### `Sequence` and `MaskedSequence`
+
+Instead of raw tensors, SequenceLayers APIs consume and produce `Sequence`
+objects.
+
+*   **Structure:** A PyTree dataclass pairing `values` (shape `[batch, time,
+    ...channels]`) with a boolean `mask` (shape `[batch, time]`).
+*   **Masking:** A `Sequence` is "masked" if all invalid positions (`mask[b,
+    t] == False`) have their corresponding `values[b, t, ...]` zeroed out.
+*   **`MaskedSequence`:** A subclass of `Sequence` that statically guarantees
+    that invalid positions are already zeroed out. Calling `.mask_invalid()` on
+    it is a no-op.
+
+--------------------------------------------------------------------------------
+
+## 3. The `SequenceLayer` API
+
+A `SequenceLayer` is a functional component that supports two primary execution
+modes:
+
+### A. Layer-wise (Offline / Training)
+
+Used for parallel processing (e.g., teacher-forced training).
+
+```python
+y = layer.layer(x, training=training)
+```
+
+*   **Input `x`:** `Sequence` of shape `[b, t_in, ...]`
+*   **Output `y`:** `Sequence` of shape `[b, t_out, ...]` where `t_out = t_in *
+    output_ratio`.
+
+### B. Step-wise (Online / Streaming / Inference)
+
+Used for autoregressive generation or streaming inference.
+
+```python
+state = layer.get_initial_state(batch_size, input_spec, training=training)
+# In a loop:
+y_step, state = layer.step(x_step, state, training=training)
+```
+
+*   **Input `x_step`:** `Sequence` of shape `[b, block_size, ...]` (usually
+    `block_size = 1`)
+*   **State:** An explicit PyTree of arrays representing the layer's temporal
+    state (e.g., KV cache, convolution buffer). No state is stored internally in
+    the layer object.
+
+### Constants
+
+Layers may accept a `Constants` dictionary for time-synchronized conditioning
+signals (e.g., speaker embeddings, language IDs). Constants are propagated
+through combinators alongside `Sequence` and `State`.
+
+### Emits
+
+Since `layer()` and `step()` return a single `Sequence`, layers that need
+auxiliary debugging output use the **Emits** API:
+
+*   `layer_with_emits(x, constants) -> (Sequence, Emits)`: Layer-wise with
+    auxiliary outputs.
+*   `step_with_emits(x, state, constants) -> (Sequence, State, Emits)`:
+    Step-wise with auxiliary outputs.
+
+The `Emitting` subclass of `SequenceLayer` implements `layer`/`step` in terms of
+the `_with_emits` variants. The `Emit` layer simply emits its input for tapping
+into intermediate sequences.
+
+### Receptive Field
+
+The `receptive_field` property computes the (start, end) range of input
+timesteps affecting each output timestep. Key details:
+
+*   For layers with `output_ratio != 1`, the receptive field is relative to
+    `t_i = t_o // output_ratio`.
+*   `receptive_field_per_step` tracks step-specific receptive fields (used by
+    combinators like `Serial` for precise composition).
+*   Special cases: infinite receptive fields (e.g., LSTM → `(-inf, 0)`), `None`
+    for timesteps with no receptive field (e.g., transposed convolution holes).
+
+--------------------------------------------------------------------------------
+
+## 4. The SequenceLayer Contract (CRITICAL)
+
+For a layer to be correct, it **MUST** satisfy the following properties,
+verified via the `verify_contract` test utility:
+
+1.  **Layer-Step Equivalence:** Running a sequence through `layer()` must
+    produce mathematically identical results (values and mask) to feeding it
+    chunk-by-chunk through `step()` and concatenating the outputs, once latency
+    is accounted for. Stateful stochastic layers (e.g., `Dropout`) should obey
+    this when the starting RNG state is equivalent.
+2.  **Padding Invariance:** Appending padding (invalid timesteps) to the end of
+    an input sequence must not affect the output values of the non-padding
+    (valid) timesteps. *Note: This is currently only required for end padding.
+    Start or interior padding may affect behavior.*
+3.  **Batching Invariance:** The position of an example in a batch, or the
+    lengths of other examples in the batch, must not affect its computed output.
+4.  **Masked Inputs/Outputs:** Layers must NOT assume input `values` are masked.
+    If a layer's computation requires masked inputs (e.g., it mixes information
+    across timesteps), it must call `mask_invalid()` on the input before use.
+
+## `verify_contract` checks: layer-step output equivalence, gradient equivalence (parameters and inputs), consistency with metadata (`get_output_spec`, `output_ratio`, `block_size`, latencies), receptive field matching (via gradient-based calculation), batching invariance (inserting invalid batch items), and padding invariance (replacing invalid timesteps with NaNs or large integers).
+
+## 5. Latency and Lookahead in Streaming
+
+When a layer requires future context (lookahead) or introduces delay, it manages
+this via two properties:
+
+*   **`input_latency`:** The number of future timesteps required to produce the
+    current output. To get all valid outputs, the caller must "flush" the layer
+    at the end of the sequence by feeding it `input_latency` invalid (padded)
+    timesteps.
+*   **`output_latency`:** The delay introduced by the layer. The first
+    `output_latency` timesteps returned by `step()` will be invalid (`mask =
+    False`) and must be discarded by the caller before expecting valid outputs.
+
+*For a causal layer, both latencies are `0`.*
+
+--------------------------------------------------------------------------------
+
+## 6. Combinators
+
+Layers are composed into larger architectures using backend-agnostic
+combinators:
+
+*   **`Serial`:** Executes a list of layers sequentially. Automatically handles
+    the nesting and plumbing of sub-layer states into a single aggregate state.
+*   **`Parallel`:** Executes multiple layers on the same input in parallel,
+    combining their outputs.
+*   **`Residual`:** Implements `F(x) + x`, managing state for `F`.
+*   **`Repeat`:** Repeats a layer `N` times, using control flow primitives like
+    `scan` to minimize compilation time.
+*   **`Blockwise`:** Dynamically adjusts the execution block size of any layer,
+    automatically implementing `layer` in terms of `step` to reduce peak memory.
+
+--------------------------------------------------------------------------------
diff --git a/pyproject.toml b/pyproject.toml
index 6d0b8c2..9930f9b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@
 name = "sequence_layers"
 description = "Sequence Layers neural network layer library from Google."
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.13"
 license = {file = "LICENSE"}
 authors = [
     {name = "RJ Skerry-Ryan", email="rjryan@google.com"},
@@ -24,8 +24,8 @@ dependencies = [
     "jaxtyping",
     "numpy",
     "orbax-export",
-    "recurrentgemma[jax]",
-    "typeguard==2.13.3",
+    "recurrentgemma[jax]>=1.0.1",
+    "typeguard>=2.13.0",
 ]
 
 # `version` is automatically set by flit to use `sequence_layers.__version__`
@@ -50,13 +50,15 @@ mlx = [
     "mlx",
 ]
 dev = [
-    "absl-py",
+    "absl-py>=1.0.0",
     "chex",
     "orbax",
+    "isort",
+    "pyink",
+    "pylint>=2.6.0",
+    "pyrefly>=0.58.0",
     "pytest",
     "pytest-xdist",
-    "pylint>=2.6.0",
-    "pyink",
     "tensorflow",  # JAX tests use TensorFlow.
 ]
 
@@ -67,6 +69,36 @@ unstable = true
 pyink-indentation = 2
 pyink-use-majority-quotes = true
 
+[tool.isort]
+profile = "google"
+line_length = 80
+
+[tool.pylint.master]
+extension-pkg-whitelist = ["mlx", "mlx.core"]
+
+[tool.pylint.format]
+indent-string = "  "
+
+[tool.pylint.basic]
+no-docstring-rgx = "^(_)?test_|^.*Test$|^__.*__$"
+
+disable = [
+  "duplicate-code",
+  "too-few-public-methods",
+  "too-many-ancestors",
+  "too-many-arguments",
+  "too-many-branches",
+  "too-many-instance-attributes",
+  "too-many-lines",
+  "too-many-locals",
+  "too-many-positional-arguments",
+  "too-many-public-methods",
+  "too-many-return-statements",
+  "too-many-statements",
+]
+
+
+
 [build-system]
 # Build system specify which backend is used to build/install the project (flit,
 # poetry, setuptools,...). All backends are supported by `pip install`
@@ -81,4 +113,10 @@ exclude = [
   # Do not release test files on PyPI
   "**/*_test.py",
   "testdata/**",
-]
\ No newline at end of file
+]
+
+[tool.pyrefly]
+# Pyrefly fails to properly support config: Config without defaults in Flax Modules
+# (used in JAX), incorrectly treating them as dataclasses and complaining about
+# field ordering. This effectively only impacts JAX files.
+errors = { missing-override-decorator = "error", bad-class-definition = "ignore" }
\ No newline at end of file

From 8d7759b492ddca7fe3478214a357711c2c438633 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Fri, 29 May 2026 09:32:30 +0000
Subject: [PATCH 02/29] refactor: Introduce specs/ protocol layer

* Define backend-agnostic structural protocols and behavior tests in specs/.

Co-authored-by: David Braun <davidbraun@google.com>
Co-authored-by: Kehang Han <kehanghan@google.com>

PiperPending-RevId: 923278026
PiperOrigin-RevId: 923278026
---
 sequence_layers/specs/__init__.py          |  67 ++
 sequence_layers/specs/backend.py           |  58 ++
 sequence_layers/specs/backend_behaviors.py |  29 +
 sequence_layers/specs/test_utils.py        |  72 ++
 sequence_layers/specs/test_utils_spec.py   |  51 ++
 sequence_layers/specs/types.py             | 988 +++++++++++++++++++++
 sequence_layers/specs/types_behaviors.py   | 837 +++++++++++++++++
 7 files changed, 2102 insertions(+)
 create mode 100644 sequence_layers/specs/__init__.py
 create mode 100644 sequence_layers/specs/backend.py
 create mode 100644 sequence_layers/specs/backend_behaviors.py
 create mode 100644 sequence_layers/specs/test_utils.py
 create mode 100644 sequence_layers/specs/test_utils_spec.py
 create mode 100644 sequence_layers/specs/types.py
 create mode 100644 sequence_layers/specs/types_behaviors.py

diff --git a/sequence_layers/specs/__init__.py b/sequence_layers/specs/__init__.py
new file mode 100644
index 0000000..1cacfc7
--- /dev/null
+++ b/sequence_layers/specs/__init__.py
@@ -0,0 +1,67 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specification for sequence_layers backend implementations.
+
+https://typing.python.org/en/latest/spec/protocol.html#modules-as-implementations-of-protocols
+"""
+
+from typing import Any, Protocol, runtime_checkable
+
+from . import test_utils_spec as _test_utils_spec
+from . import types as _types
+
+
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Protocol for a backend-specific SequenceLayers module (sequence_layers.<backend> as sl)."""
+
+  # pylint: disable=missing-function-docstring
+
+  @property
+  def backend(self) -> Any:
+    ...
+
+  @property
+  def types(self) -> _types.ModuleSpec:
+    ...
+
+  @property
+  def test_utils(self) -> _test_utils_spec.ModuleSpec:
+    ...
+
+  # pylint: disable=invalid-name
+
+  # Identifiers that backend-specific implementations should expose at top
+  # level. Demonstrating read-only allows for covariance (subclasses of
+  # types_module.Sequence to satisfy the protocol).
+
+  @property
+  def Sequence(self) -> type[_types.Sequence]:
+    ...
+
+  @property
+  def MaskedSequence(self) -> type[_types.MaskedSequence]:
+    ...
+
+  @property
+  def SequenceLayer(self) -> type[_types.SequenceLayer]:
+    ...
+
+  @property
+  def SequenceLayerConfig(self) -> type[_types.SequenceLayerConfig]:
+    ...
+
+  @property
+  def SequenceLayerTest(self) -> type[Any]:
+    ...
diff --git a/sequence_layers/specs/backend.py b/sequence_layers/specs/backend.py
new file mode 100644
index 0000000..45657bf
--- /dev/null
+++ b/sequence_layers/specs/backend.py
@@ -0,0 +1,58 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specification for backend-specific helpers."""
+
+from typing import Any, Protocol, Sequence as TypingSequence, runtime_checkable
+
+from sequence_layers.specs import types as types_spec
+
+Array = types_spec.Array
+
+
+# pylint: disable=invalid-name
+class xp(Protocol):
+  """NumPy-compatible interface to enable generic behavior tests.
+
+  https://numpy.org/doc/stable/reference/routines.html#routines
+  https://docs.jax.dev/en/latest/jax.numpy.html
+  """
+
+  bool_: Any
+  int32: Any
+  float32: Any
+
+  def array(self, a: Any, dtype: Any = None) -> Array:
+    """Creates an array."""
+
+  def zeros(self, shape: tuple[int, ...], dtype: Any = None) -> Array:
+    """Creates an array of zeros."""
+
+  def concatenate(self, arrays: TypingSequence[Array], axis: int = 0) -> Array:
+    """Concatenates a list of arrays."""
+
+
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Specification for sequence_layers.<backend>.backend."""
+
+  @property
+  def xp(self) -> xp:
+    """Returns the NumPy-compatible interface."""
+
+
+__all__ = [
+    name
+    for name, attr in ModuleSpec.__dict__.items()
+    if isinstance(attr, property)
+]
diff --git a/sequence_layers/specs/backend_behaviors.py b/sequence_layers/specs/backend_behaviors.py
new file mode 100644
index 0000000..9123112
--- /dev/null
+++ b/sequence_layers/specs/backend_behaviors.py
@@ -0,0 +1,29 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Abstract tests for backend utilities."""
+
+# pylint: disable=abstract-method
+
+from typing import override
+
+from sequence_layers import specs
+from sequence_layers.specs import backend as backend_spec
+from sequence_layers.specs import test_utils as test_utils_spec
+
+
+class ModuleSpecTest(test_utils_spec.ModuleSpecTest):
+
+  @override
+  def module_spec_pairs(self, backend_sl: specs.ModuleSpec):
+    return {backend_sl.backend: backend_spec.ModuleSpec}
diff --git a/sequence_layers/specs/test_utils.py b/sequence_layers/specs/test_utils.py
new file mode 100644
index 0000000..119e4dd
--- /dev/null
+++ b/sequence_layers/specs/test_utils.py
@@ -0,0 +1,72 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities for testing sequence layers."""
+
+import abc
+from typing import Any
+
+from absl.testing import parameterized
+from sequence_layers import specs
+from sequence_layers.specs import backend as backend_spec
+from sequence_layers.specs import test_utils_spec
+from sequence_layers.specs import types as types_spec
+
+
+class _AbcParameterizedTestCaseMeta(abc.ABCMeta, type(parameterized.TestCase)):
+  """Metaclass for abstract parameterized test cases."""
+
+
+class SequenceLayerTest(
+    parameterized.TestCase,
+    metaclass=_AbcParameterizedTestCaseMeta,
+):
+  """Base test class providing common sequence testing assertions."""
+
+  sl: specs.ModuleSpec
+
+  @property
+  def xp(self) -> backend_spec.xp:
+    """Returns the backend wrapper."""
+    return self.sl.backend.xp
+
+  # pylint: disable=invalid-name
+
+  @abc.abstractmethod
+  def assertSequencesEqual(
+      self, x: types_spec.Sequence, y: types_spec.Sequence
+  ) -> None:
+    """Asserts that two sequences are equal."""
+
+  @abc.abstractmethod
+  def assertAllEqual(self, x: Any, y: Any) -> None:
+    """Asserts that all elements are equal."""
+
+  # pylint: enable=invalid-name
+
+
+class ModuleSpecTest(SequenceLayerTest):
+  """Test that a backend-specific module implements the ModuleSpec protocol."""
+
+  @abc.abstractmethod
+  def module_spec_pairs(self, backend_sl: specs.ModuleSpec) -> dict[Any, Any]:
+    """Returns a mapping of module to protocol to be verified."""
+
+  def test_backend_specific_module_has_interface(self) -> None:
+    pairs = self.module_spec_pairs(self.sl)
+    for mod, protocol in pairs.items():
+      self.assertIsInstance(mod, protocol)
+
+
+ModuleSpec = test_utils_spec.ModuleSpec
+__all__ = test_utils_spec.__all__
diff --git a/sequence_layers/specs/test_utils_spec.py b/sequence_layers/specs/test_utils_spec.py
new file mode 100644
index 0000000..636062e
--- /dev/null
+++ b/sequence_layers/specs/test_utils_spec.py
@@ -0,0 +1,51 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Protocol specification for test_utils modules.
+
+This is in a separate file to break import cycle between test_utils.py and
+specs/__init__.py.
+"""
+
+from typing import Any, Callable, Iterable, Protocol, runtime_checkable
+
+
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Specification for sequence_layers.<backend>.test_utils."""
+
+  def zip_longest(
+      self,
+      targets: Iterable[Iterable[Any]],
+      sources: Iterable[Any],
+  ) -> list[Any]:
+    """Zips targets and sources."""
+
+  def named_product(
+      self,
+      first: Iterable[Any],
+      second: Iterable[Any],
+  ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """Creates a named product."""
+
+  @property
+  def SequenceLayerTest(self) -> type[Any]:  # pylint: disable=invalid-name
+    ...
+
+
+__all__ = [
+    name
+    for name, attr in ModuleSpec.__dict__.items()
+    if isinstance(attr, property)
+    or (callable(attr) and not name.startswith('__'))
+]
diff --git a/sequence_layers/specs/types.py b/sequence_layers/specs/types.py
new file mode 100644
index 0000000..f37279f
--- /dev/null
+++ b/sequence_layers/specs/types.py
@@ -0,0 +1,988 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Signatures for the types module.
+
+See the corresponding _behaviors module for behaviors.
+
+If you are adding a new class or method to be implemented per backend, make
+sure to add it to the ModuleSpec protocol.
+"""
+
+import abc
+import dataclasses  # pylint: disable=unused-import
+import enum
+import fractions
+from typing import (
+    Any,
+    Callable,
+    Concatenate,
+    Generic,
+    Iterable,
+    Literal,
+    MutableMapping,
+    Protocol,
+    Self,
+    TypeVar,
+    override,
+    runtime_checkable,
+)
+from typing import cast  # pylint: disable=unused-import
+
+import jaxtyping as jt
+import numpy.typing as npt
+
+ArrayLike = npt.ArrayLike
+
+Array = jt.Shaped[Any, '...']
+
+# Type aliases for generic usage
+T = TypeVar('T')
+Shape = tuple[int, ...]
+ShapeLike = list[int] | tuple[int, ...]
+DType = Any  # Can be numpy, jax, or mlx dtype
+
+
+class ChannelSpec(Protocol):
+  """Protocol for channel specifications."""
+
+  @property
+  def shape(self) -> Shape:
+    """The shape of the channel."""
+
+  @property
+  def dtype(self) -> Any:
+    """The dtype of the channel."""
+
+
+State = Any
+Constants = MutableMapping[str, jt.PyTree[Array]]
+Emits = jt.PyTree[Array]
+
+
+ValuesT = TypeVar('ValuesT', bound=Array, default=Array)
+MaskT = TypeVar('MaskT', bound=Array, default=Array)
+ChannelSpecT = TypeVar('ChannelSpecT', bound=ChannelSpec, default=ChannelSpec)
+
+LengthsT = TypeVar('LengthsT', bound=Array, default=Array)
+
+InputT = TypeVar('InputT', bound='Sequence', default='Sequence')
+OutputT = TypeVar('OutputT', bound='Sequence', default='Sequence')
+
+# A "self" type alias to allow Sequence and subclasses to return their own
+# Sequence subtype. (Self cannot be parameterized.)
+SequenceSelf = TypeVar('SequenceSelf', bound='Sequence')
+
+
+class PaddingMode(enum.Enum):
+  """Supported padding modes."""
+
+  # In VALID padding mode, no padding is applied.
+  #
+  # Key properties:
+  # * The physical length of an input array to a VALID padded function shrinks,
+  #   dropping any timesteps whose inputs are computed from implicit edge
+  #   padding.
+  # * An output timestep is valid when all of its input timesteps are also
+  #   valid.
+  VALID = 'valid'
+
+  # In SAME padding mode, the input sequence is padded such that the output
+  # length is equal to the input length before applying striding.
+  #
+  # Key properties:
+  # * The input length is equal to the output length, before applying striding.
+  # * Padding of `effective_kernel_size - 1` is applied. Half is applied to the
+  #   front and half to the back. If `effective_kernel_size` is even, the extra
+  #   padding is added to the end.
+  # * An output timestep is valid when its corresponding input timestep is
+  #   valid.
+  SAME = 'same'
+
+  # In CAUSAL_VALID padding mode, the input sequence is padded such that the
+  # output length is equal to the input length before applying striding. Padding
+  # is applied such that the output timestep `to` can only depend on input
+  # timesteps at or before `ti` where `ti * output_ratio = to`.
+  #
+  # Key properties:
+  # * As in SAME padding, the input length is equal to the output length, before
+  #   applying striding.
+  # * Padding of `effective_kernel_size - 1` is applied to the front of the
+  #   sequence.
+  # * As in VALID padding, an output timestep is valid iff all of its input
+  #   timesteps are also valid.
+  CAUSAL_VALID = 'causal_valid'
+
+  # In REVERSE_CAUSAL_VALID padding mode, the input sequence is padded such that
+  # the output length is equal to the input length before applying striding.
+  # Padding is applied such that the output timestep `to` can only depend on
+  # input timesteps at or after `ti` where `ti * output_ratio = to`.
+  #
+  # Key properties:
+  # * As in SAME padding, the input length is equal to the output length, before
+  #   applying striding.
+  # * Padding of `effective_kernel_size - 1` is applied to the back of the
+  #   sequence.
+  REVERSE_CAUSAL_VALID = 'reverse_causal_valid'
+
+  # In CAUSAL padding mode, the input sequence is padded such that the output
+  # length is equal to the input length before applying striding. Padding is
+  # applied such that the output timestep `to` can only depend on input
+  # timesteps at or before `ti` where `ti * output_ratio = to`.
+  #
+  # Key properties:
+  # * As in SAME padding, the input length is equal to the output length, before
+  #   applying striding.
+  # * Padding of `effective_kernel_size - 1` is applied to the front of the
+  #   sequence.
+  # * As in SAME padding, an output timestep is valid when its corresponding
+  #   input timestep is valid.
+  CAUSAL = 'causal'
+
+  # In REVERSE_CAUSAL padding mode, the input sequence is padded such that the
+  # output length is equal to the input length before applying striding. Padding
+  # is applied such that the output timestep `to` can only depend on input
+  # timesteps at or after `ti` where `ti * output_ratio = to`.
+  #
+  # Key properties:
+  # * As in SAME padding, the input length is equal to the output length, before
+  #   applying striding.
+  # * Padding of `effective_kernel_size - 1` is applied to the back of the
+  #   sequence.
+  # * As in SAME padding, an output timestep is valid when its corresponding
+  #   input timestep is valid.
+  REVERSE_CAUSAL = 'reverse_causal'
+
+  # In SEMICAUSAL padding mode, the input sequence is padded such that the
+  # output length is equal to the input length before applying striding. Padding
+  # is applied such that the output timestep `to` can only depend on input
+  # timesteps at or before `ti` where `ti * output_ratio = to`.
+  #
+  # Key properties:
+  # * As in SAME padding, the input length is equal to the output length, before
+  #   applying striding.
+  # * Padding of `effective_kernel_size - stride` is applied to the front of the
+  #   sequence, and padding of `stride - 1` timesteps is applied to the back of
+  #   the sequence for a total of `effective_kernel_size - 1` timesteps of
+  #   padding. If `effective_kernel_size` < `stride`, then padding of
+  #   `effective_kernel_size - 1` is applied to the back of the sequence.
+  # * As in SAME padding, an output timestep is valid when its corresponding
+  #   input timestep is valid.
+  SEMICAUSAL = 'semicausal'
+
+  # In SEMICAUSAL_FULL padding mode, the input sequence is padded such that the
+  # output of the corresponding overlap-add or transpose convolution is of the
+  # same size as the input sequence and perfect reconstruction can be achieved.
+  # The reconstructed signal is of the same length or of length rounded up to
+  # cover the full input sequence.
+  SEMICAUSAL_FULL = 'semicausal_full'
+
+
+PaddingModeString = Literal[
+    'valid',
+    'same',
+    'causal_valid',
+    'reverse_causal_valid',
+    'causal',
+    'reverse_causal',
+    'semicausal',
+    'semicausal_full',
+]
+
+
+class Sequence(Generic[ValuesT, MaskT], metaclass=abc.ABCMeta):
+  """Abstract base class for Sequence."""
+
+  values: ValuesT
+  mask: MaskT
+
+  def __init__(self, values: ValuesT, mask: MaskT):
+    raise NotImplementedError('Subclasses must implement __init__')
+
+  @property
+  @abc.abstractmethod
+  def shape(self) -> Shape:
+    """The shape of the sequence as (batch, time, ...channels)."""
+
+  @property
+  @abc.abstractmethod
+  def ndim(self) -> int:
+    """The number of dimensions of the sequence values."""
+
+  @property
+  @abc.abstractmethod
+  def channel_shape(self) -> Shape:
+    """The shape of the channels in the sequence."""
+
+  @property
+  @abc.abstractmethod
+  def dtype(self) -> DType:
+    """The dtype of the sequence values."""
+
+  @classmethod
+  @abc.abstractmethod
+  def from_values(cls, values: ValuesT) -> Self:
+    """Creates a Sequence from values with a default mask."""
+
+  @classmethod
+  @abc.abstractmethod
+  def from_lengths(
+      cls,
+      values: ValuesT,
+      lengths: LengthsT,
+      is_masked: bool = False,
+  ) -> Self:
+    """Creates a Sequence from values and lengths."""
+
+  @classmethod
+  @abc.abstractmethod
+  def concatenate_sequences(cls, sequences: Iterable[Self]) -> Self:
+    """Concatenates multiple sequences into one."""
+
+  @abc.abstractmethod
+  def expanded_mask(self) -> Any:
+    """Returns the mask expanded to the shape of the values."""
+
+  @abc.abstractmethod
+  def apply_values[NewValuesT: Array, **P](
+      self,
+      values_fn: Callable[Concatenate[ValuesT, P], NewValuesT],
+      *args: P.args,
+      **kwargs: P.kwargs,
+  ) -> 'Sequence[NewValuesT, MaskT]':
+    """Applies a function to the sequence values."""
+
+  @abc.abstractmethod
+  def apply_values_masked[NewValuesT: Array, **P](
+      self,
+      values_fn: Callable[Concatenate[ValuesT, P], NewValuesT],
+      *args: P.args,
+      **kwargs: P.kwargs,
+  ) -> 'Sequence[NewValuesT, MaskT]':
+    """Applies a function to the sequence values, respecting the mask."""
+
+  @abc.abstractmethod
+  def apply[NewValuesT: Array, NewMaskT: Array, **P](
+      self,
+      apply_fn: Callable[
+          Concatenate[ValuesT, MaskT, P], tuple[NewValuesT, NewMaskT]
+      ],
+      *args: P.args,
+      **kwargs: P.kwargs,
+  ) -> 'Sequence[NewValuesT, NewMaskT]':
+    """Applies a function to both values and mask."""
+
+  @abc.abstractmethod
+  def apply_masked[NewValuesT: Array, NewMaskT: Array, **P](
+      self,
+      apply_fn: Callable[
+          Concatenate[ValuesT, MaskT, P], tuple[NewValuesT, NewMaskT]
+      ],
+      *args: P.args,
+      **kwargs: P.kwargs,
+  ) -> 'Sequence[NewValuesT, NewMaskT]':
+    """Applies a function to values and mask, respecting the mask."""
+
+  @abc.abstractmethod
+  def astype(self, dtype: DType | None) -> Self:
+    """Returns a copy of the sequence with a new dtype."""
+
+  @abc.abstractmethod
+  def lengths(self) -> Any:
+    """Returns the lengths of the sequences in the batch."""
+
+  @abc.abstractmethod
+  def __getitem__(
+      self,
+      the_slice: slice | tuple[int | slice | None | type(Ellipsis), ...],
+  ) -> Self:
+    ...
+
+  @abc.abstractmethod
+  def pad_time(
+      self,
+      pad_left: int,
+      pad_right: int,
+      valid: bool,
+      pad_value: Any | None = None,
+  ) -> Self:
+    """Pads the sequence along the time dimension."""
+
+  @abc.abstractmethod
+  def concatenate(self, other: Self) -> Self:
+    """Concatenates another sequence to this one."""
+
+  @abc.abstractmethod
+  def mask_invalid(
+      self, mask_value: Any | None = None
+  ) -> 'Sequence[ValuesT, MaskT]':
+    """Returns a MaskedSequence with invalid timesteps zeroed."""
+
+  @abc.abstractmethod
+  def unmask(self) -> 'Sequence[ValuesT, MaskT]':
+    """Returns a Sequence with no masking applied."""
+
+
+class MaskedSequence(Sequence[ValuesT, MaskT]):
+  """A sequence whose invalid timesteps are masked to zero."""
+
+  @abc.abstractmethod
+  @override
+  def apply_values_masked[NewValuesT: Array, **P](
+      self,
+      values_fn: Callable[Concatenate[ValuesT, P], NewValuesT],
+      *args: P.args,
+      **kwargs: P.kwargs,
+  ) -> 'MaskedSequence[NewValuesT, MaskT]':
+    ...
+
+  @abc.abstractmethod
+  @override
+  def apply_masked[NewValuesT: Array, NewMaskT: Array, **P](
+      self,
+      apply_fn: Callable[
+          Concatenate[ValuesT, MaskT, P], tuple[NewValuesT, NewMaskT]
+      ],
+      *args: P.args,
+      **kwargs: P.kwargs,
+  ) -> 'MaskedSequence[NewValuesT, NewMaskT]':
+    ...
+
+
+class SequenceLayerConfig(metaclass=abc.ABCMeta):
+  """Configuration for a SequenceLayer."""
+
+  @abc.abstractmethod
+  def make(self) -> Any:
+    """Creates the sequence layer."""
+
+  @abc.abstractmethod
+  def copy(self, **kwargs: Any) -> Self:
+    """Returns a copy of the config with updated fields."""
+
+
+class Steppable(Generic[InputT, OutputT, ChannelSpecT], metaclass=abc.ABCMeta):
+  """A sequence processing layer that can be executed layerwise or stepwise.
+
+  The backend must implement:
+  - layer_with_emits
+  - step_with_emits
+  """
+
+  @property
+  @abc.abstractmethod
+  def block_size(self) -> int:
+    """The block size this layer processes at once."""
+
+  @property
+  @abc.abstractmethod
+  def output_ratio(self) -> fractions.Fraction:
+    """The ratio of output timesteps to input timesteps."""
+
+  @property
+  @abc.abstractmethod
+  def supports_step(self) -> bool:
+    """Returns True if the layer supports stepwise processing."""
+
+  @property
+  @abc.abstractmethod
+  def input_latency(self) -> int:
+    """The number of future timesteps required for the current output."""
+
+  @property
+  @abc.abstractmethod
+  def output_latency(self) -> int:
+    """The number of timesteps the output is delayed."""
+
+  @abc.abstractmethod
+  def get_accumulated_input_latency(self, input_latency: int) -> int:
+    """Calculates the total input latency including previous layers."""
+
+  @abc.abstractmethod
+  def get_accumulated_output_latency(self, output_latency: int) -> int:
+    """Calculates the total output latency including previous layers."""
+
+  @abc.abstractmethod
+  def layer(
+      self, x: InputT, *, training: bool, constants: Constants | None = None
+  ) -> OutputT:
+    """Process this layer layer-wise.
+
+    Args:
+      x: Input sequence with values shaped [b, t_i, ...].
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the source sequence to attend to.
+
+    Returns:
+      y: The outputs corresponding to this layer with values shaped
+        [b, t_o, ...] where `t_o == t_i * output_ratio`. t_o may have been
+        truncated to only represent valid frames.
+    """
+
+  @abc.abstractmethod
+  def layer_with_emits(
+      self,
+      x: InputT,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[OutputT, Emits]:
+    """Process this layer layer-wise, producing emitted arrays.
+
+    This is like `layer`, except it has an additional return value which is the
+    "emitted" arrays for the layer. The emitted arrays are a structure of
+    arrays whose values are arrays or `Sequence`s.
+
+    Args:
+      x: Input sequence with values shaped [b, t_i, ...].
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the key/value sequence to attend to.
+
+    Returns:
+      y: The outputs corresponding to this layer with values shaped
+        [b, t_o, ...] where `t_o == t_i * output_ratio`. t_o may have been
+        truncated to only represent valid frames.
+      emits: A nest of emitted arrays or Sequences.
+    """
+
+  @abc.abstractmethod
+  def step(
+      self,
+      x: InputT,
+      state: State,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[OutputT, State]:
+    """Process this layer step-wise.
+
+    Args:
+      x: Input sequence with values shaped [b, t_i, ...], where t_i is a
+        multiple of block_size.
+      state: A structure of state arrays matching get_initial_state. The
+        previous state for this layer.
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the key/value sequence to attend to.
+
+    Returns:
+      y: The outputs corresponding to this step with values shaped [b, t_o, ...]
+        where `t_o == t_i * output_ratio`.
+      state: A structure of state arrays matching get_initial_state. The
+        new state for this layer.
+    """
+
+  @abc.abstractmethod
+  def step_with_emits(
+      self,
+      x: InputT,
+      state: State,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[OutputT, State, Emits]:
+    """Process this layer step-wise, producing emitted arrays.
+
+    This is like `step`, except it has an additional return value which is the
+    "emitted" arrays for the step. The emitted arrays are a structure of
+    arrays whose values are arrays or `Sequence`s.
+
+    Args:
+      x: Input sequence with values shaped [b, t_i, ...], where t_i is a
+        multiple of block_size.
+      state: A structure of state arrays matching get_initial_state. The
+        previous state for this layer.
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the key/value sequence to attend to.
+
+    Returns:
+      y: The outputs corresponding to this step with values shaped [b, t_o, ...]
+        where `t_o == t_i * output_ratio`.
+      state: A structure of state arrays matching get_initial_state. The
+        new state for this layer.
+      emits: A nest of emitted arrays or Sequences.
+    """
+
+  @abc.abstractmethod
+  def get_initial_state(
+      self,
+      batch_size: int,
+      input_spec: ChannelSpecT,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> State:
+    """Returns the initial state for this SequenceLayer.
+
+    Args:
+      batch_size: The batch size to create state for.
+      input_spec: An input ChannelSpec representing the channel shape and dtype
+        of the input that will be stepped.
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the source sequence to attend to.
+
+    Returns:
+      An integer, shape, or structure of integer/shapes.
+    """
+
+  @abc.abstractmethod
+  def get_output_shape(
+      self,
+      input_shape: ShapeLike,
+      *,
+      constants: Constants | None = None,
+  ) -> Shape:
+    """Returns the output channel shape this layer produces for an input channel shape.
+
+    Args:
+      input_shape: A shape representing the channels dimension of the input
+        sequence (i.e. not including the batch or time dimension).
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the source sequence to attend to.
+
+    Returns:
+      A shape representing the output channels dimensions (i.e. not including
+      the batch or time dimension).
+    """
+
+  @abc.abstractmethod
+  def get_output_dtype(
+      self,
+      input_dtype: DType,
+      *,
+      constants: Constants | None = None,
+  ) -> DType:
+    """Returns the layer's output dtype for the specified input dtype.
+
+    Args:
+      input_dtype: The dtype of the input features.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing.
+
+    Returns:
+      The dtype of the output features.
+    """
+
+  @abc.abstractmethod
+  def get_output_spec(
+      self,
+      input_spec: ChannelSpecT,
+      *,
+      constants: Constants | None = None,
+  ) -> ChannelSpec:
+    """Returns the output spec this layer produces for the provided input spec.
+
+    Args:
+      input_spec: A ChannelSpec which represents the channels shape and dtype of
+        the input sequence (i.e. not including the batch or time dimension).
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing.
+
+    Returns:
+      The ChannelSpec of the output features.
+    """
+
+  @property
+  @abc.abstractmethod
+  def receptive_field(self) -> Any:
+    """Returns the range of the receptive field of this layer.
+
+    A `(start, end)` tuple indicating the input time step range
+    `[ti + start, ti + end]` that affects the output step `to`, where `ti` is
+    the first step of the input block corresponding to `to`, i.e.,
+    `ti = to // output_ratio`.
+
+    For cases where the receptive field varies across steps, this returns the
+    union (min/max) of the per-step receptive fields. For example, with 2x
+    downsampling and 2x upsampling the receptive field is `(0, 1)` at even
+    steps and `(-1, 0)` at odd steps; this property would return `(-1, 1)`.
+
+    Returns:
+      A `(start, end)` tuple, or `None`. Infinite receptive field is
+      represented with `+/-inf` (e.g. RNNs have `(-inf, 0)`). `None`
+      indicates no receptive field (e.g. `Conv1DTranspose` with
+      `kernel_size=1, stride=2` produces `None` for every other step).
+    """
+
+
+class SequenceLayer(
+    Steppable[InputT, OutputT, ChannelSpecT], metaclass=abc.ABCMeta
+):
+  """Base class for Sequence Layers."""
+
+
+# ---------------------------------------------------------------------------
+# Mixins
+# ---------------------------------------------------------------------------
+
+
+class PreservesType:
+  """A mix-in for layers that do not change the input dtype."""
+
+  @abc.abstractmethod
+  def get_output_dtype(
+      self,
+      input_dtype: DType,
+      *,
+      constants: Constants | None = None,
+  ) -> DType:
+    """Returns the output dtype, which is the same as the input dtype."""
+
+
+class PreservesShape:
+  """A mix-in for layers that do not change the input channel shape."""
+
+  @abc.abstractmethod
+  def get_output_shape(
+      self,
+      input_shape: ShapeLike,
+      *,
+      constants: Constants | None = None,
+  ) -> Shape:
+    """Returns the output shape, which is the same as the input shape."""
+
+
+# ---------------------------------------------------------------------------
+# Stateless variants
+# ---------------------------------------------------------------------------
+
+
+class Stateless(SequenceLayer[InputT, OutputT, ChannelSpecT]):
+  """A layer with no state over time required for step-wise processing.
+
+  The backend must implement:
+  - get_initial_state
+  - step
+  Further sub-classes must only implement:
+  - layer
+  - get_output_shape
+  - get_output_dtype
+  """
+
+  @abc.abstractmethod
+  @override
+  def get_output_shape(
+      self, input_shape: ShapeLike, *, constants: Constants | None = None
+  ) -> Shape:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def get_output_dtype(
+      self, input_dtype: DType, *, constants: Constants | None = None
+  ) -> DType:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def layer(
+      self,
+      x: InputT,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> OutputT:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def get_initial_state(
+      self,
+      batch_size: int,
+      input_spec: ChannelSpecT,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> State:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def step(
+      self,
+      x: InputT,
+      state: State,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[OutputT, State]:
+    ...
+
+
+class StatelessPointwise(
+    PreservesShape,
+    Stateless[InputT, OutputT, ChannelSpecT],
+    metaclass=abc.ABCMeta,
+):
+  """Stateless layer that operates pointwise (preserves shape)."""
+
+
+class StatelessPointwiseFunctor(
+    StatelessPointwise[InputT, OutputT, ChannelSpecT]
+):
+  """Stateless pointwise layer defined by a fn(values, mask).
+
+  The backend must implement:
+  - layer
+  Further sub-classes must only implement:
+  - fn
+  - mask_required
+  """
+
+  @abc.abstractmethod
+  def fn(self, values: Any, mask: Any) -> tuple[Any, Any]:
+    """Transforms each scalar in values independently."""
+
+  @property
+  @abc.abstractmethod
+  def mask_required(self) -> bool:
+    """Returns true if fn can change the sequence's masked state.
+
+    If fn(0) -> 0, then mask_required() is False.
+    """
+
+  @abc.abstractmethod
+  @override
+  def layer(
+      self,
+      x: InputT,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> OutputT:
+    ...
+
+
+# ---------------------------------------------------------------------------
+# Emitting variants
+# ---------------------------------------------------------------------------
+
+
+class Emitting(SequenceLayer[InputT, OutputT, ChannelSpecT]):
+  """A Steppable layer that emits auxiliary arrays.
+
+  This is a convenience subclass that implements step and layer in terms of
+  step_with_emits and layer_with_emits.
+
+  The backend must implement:
+  - step
+  - layer
+  Further sub-classes must only implement:
+  - step_with_emits
+  - layer_with_emits
+  """
+
+  @abc.abstractmethod
+  @override
+  def step(
+      self,
+      x: InputT,
+      state: State,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[OutputT, State]:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def layer(
+      self,
+      x: InputT,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> OutputT:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def step_with_emits(
+      self,
+      x: InputT,
+      state: State,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[OutputT, State, Emits]:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def layer_with_emits(
+      self,
+      x: InputT,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[OutputT, Emits]:
+    ...
+
+
+class StatelessEmitting(Emitting[InputT, OutputT, ChannelSpecT]):
+  """A Steppable layer with no state over time that emits auxiliary arrays.
+
+  The backend must implement:
+  - get_initial_state
+  - step_with_emits
+  Further sub-classes must only implement:
+  - layer_with_emits
+  - get_output_shape
+  - get_output_dtype
+  """
+
+  @abc.abstractmethod
+  @override
+  def get_initial_state(
+      self,
+      batch_size: int,
+      input_spec: ChannelSpecT,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> State:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def step_with_emits(
+      self,
+      x: InputT,
+      state: State,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[OutputT, State, Emits]:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def get_output_shape(
+      self, input_shape: ShapeLike, *, constants: Constants | None = None
+  ) -> Shape:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def get_output_dtype(
+      self, input_dtype: DType, *, constants: Constants | None = None
+  ) -> DType:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def layer_with_emits(
+      self,
+      x: InputT,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[OutputT, Emits]:
+    ...
+
+
+_SequenceType = Sequence
+_MaskedSequenceType = MaskedSequence
+_SequenceLayerType = SequenceLayer
+_SequenceLayerConfigType = SequenceLayerConfig
+_SteppableType = Steppable
+
+
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Specification for sequence_layers.<backend>.types."""
+
+  # pylint: disable=invalid-name
+
+  @property
+  def Sequence(self) -> type[_SequenceType[Any, Any]]:
+    """The Sequence class for this backend."""
+
+  @property
+  def MaskedSequence(self) -> type[_MaskedSequenceType[Any, Any]]:
+    """The MaskedSequence class for this backend."""
+
+  @property
+  def SequenceLayer(self) -> type[_SequenceLayerType]:
+    """The SequenceLayer class for this backend."""
+
+  @property
+  def SequenceLayerConfig(self) -> type[_SequenceLayerConfigType]:
+    """The SequenceLayerConfig class for this backend."""
+
+  @property
+  def Steppable(self) -> type[_SteppableType[Any, Any, Any]]:
+    """The Steppable class for this backend."""
+
+  @property
+  def PreservesShape(self) -> type[PreservesShape]:
+    """The PreservesShape class for this backend."""
+
+  @property
+  def Stateless(self) -> type[Stateless]:
+    """The Stateless class for this backend."""
+
+  @property
+  def StatelessPointwise(self) -> type[StatelessPointwise]:
+    """The StatelessPointwise class for this backend."""
+
+  @property
+  def StatelessPointwiseFunctor(self) -> type[StatelessPointwiseFunctor]:
+    """The StatelessPointwiseFunctor class for this backend."""
+
+  @property
+  def PreservesType(self) -> type[PreservesType]:
+    """The PreservesType class for this backend."""
+
+  @property
+  def Emitting(self) -> type[Emitting]:
+    """The Emitting class for this backend."""
+
+  @property
+  def StatelessEmitting(self) -> type[StatelessEmitting]:
+    """The StatelessEmitting class for this backend."""
+
+
+__all__ = (
+    'ChannelSpec',
+    'Sequence',
+    'MaskedSequence',
+    'SequenceLayer',
+    'SequenceLayerConfig',
+    'Steppable',
+    'PreservesShape',
+    'Stateless',
+    'StatelessPointwise',
+    'StatelessPointwiseFunctor',
+    'PreservesType',
+    'Emitting',
+    'StatelessEmitting',
+)
diff --git a/sequence_layers/specs/types_behaviors.py b/sequence_layers/specs/types_behaviors.py
new file mode 100644
index 0000000..b2b3a99
--- /dev/null
+++ b/sequence_layers/specs/types_behaviors.py
@@ -0,0 +1,837 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=abstract-method
+"""Generic tests for Sequence types_spec."""
+
+import dataclasses
+import fractions
+from typing import Any, NamedTuple, override
+import unittest.mock
+
+from absl.testing import parameterized
+import numpy as np
+from sequence_layers import specs
+from sequence_layers.specs import test_utils as test_utils_spec
+from sequence_layers.specs import types as types_spec
+
+SequenceLayerTest = test_utils_spec.SequenceLayerTest
+
+
+class ModuleSpecTest(test_utils_spec.ModuleSpecTest):
+  """Abstract tests for ModuleSpec behaviors."""
+
+  @override
+  def module_spec_pairs(self, backend_sl: specs.ModuleSpec):
+    return {backend_sl.types: types_spec.ModuleSpec}
+
+  def test_backend_specific_types_are_subclasses(self) -> None:
+    pairs = self.module_spec_pairs(self.sl)
+    for mod, protocol in pairs.items():
+      if protocol is types_spec.ModuleSpec:
+        self.assertTrue(issubclass(mod.Sequence, types_spec.Sequence))
+        self.assertTrue(
+            issubclass(mod.MaskedSequence, types_spec.MaskedSequence)
+        )
+        self.assertTrue(issubclass(mod.SequenceLayer, types_spec.SequenceLayer))
+        self.assertTrue(
+            issubclass(mod.SequenceLayerConfig, types_spec.SequenceLayerConfig)
+        )
+        self.assertTrue(issubclass(mod.Steppable, types_spec.Steppable))
+
+
+class DummyChannelSpec(NamedTuple):
+  """Dummy channel spec for testing."""
+
+  shape: types_spec.Shape
+  dtype: types_spec.DType
+
+
+class DefaultTestLayer(types_spec.SequenceLayer):
+  """A default test layer for testing."""
+
+  @property
+  @override
+  def block_size(self) -> int:
+    return 1
+
+  @property
+  @override
+  def output_ratio(self) -> fractions.Fraction:
+    return fractions.Fraction(1)
+
+  @property
+  @override
+  def supports_step(self) -> bool:
+    return True
+
+  @property
+  @override
+  def input_latency(self) -> int:
+    return 0
+
+  @property
+  @override
+  def output_latency(self) -> int:
+    return 0
+
+  @property
+  @override
+  def receptive_field(self) -> Any:
+    return 1
+
+  @override
+  def get_accumulated_input_latency(self, input_latency: int) -> int:
+    return input_latency
+
+  @override
+  def get_accumulated_output_latency(self, output_latency: int) -> int:
+    return output_latency
+
+  @override
+  def layer(
+      self,
+      x: types_spec.Sequence,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.Sequence:
+    return x
+
+  @override
+  def layer_with_emits(
+      self,
+      x: types_spec.Sequence,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> tuple[types_spec.Sequence, types_spec.Emits]:
+    return self.layer(x, training=training, constants=constants), (
+        'test_emits',
+    )
+
+  @override
+  def step(
+      self,
+      x: types_spec.Sequence,
+      state: types_spec.State,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> tuple[types_spec.Sequence, types_spec.State]:
+    return x, ('new_test_state',)
+
+  @override
+  def step_with_emits(
+      self,
+      x: types_spec.Sequence,
+      state: types_spec.State,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> tuple[types_spec.Sequence, types_spec.State, types_spec.Emits]:
+    return *self.step(x, state, training=training, constants=constants), (
+        'test_emits',
+    )
+
+  @override
+  def get_initial_state(
+      self,
+      batch_size: int,
+      input_spec: types_spec.ChannelSpec,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.State:
+    return ('test_state',)
+
+  @override
+  def get_output_shape(
+      self,
+      input_shape: types_spec.ShapeLike,
+      *,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.Shape:
+    return tuple(input_shape) + (1,)
+
+  @override
+  def get_output_dtype(
+      self,
+      input_dtype: types_spec.DType,
+      *,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.DType:
+    return np.float64
+
+  @override
+  def get_output_spec(
+      self,
+      input_spec: Any,
+      *,
+      constants: types_spec.Constants | None = None,
+  ) -> Any:
+    shape = self.get_output_shape(input_spec.shape, constants=constants)
+    dtype = self.get_output_dtype(input_spec.dtype, constants=constants)
+    return DummyChannelSpec(shape, dtype)
+
+
+class ModuleInterfaceTest(SequenceLayerTest):
+  """Abstract tests for ModuleInterface behaviors."""
+
+  def test_backend_specific_module_has_interface(self) -> None:
+    self.assertIsInstance(self.sl.types, types_spec.ModuleSpec)
+
+
+class SequenceTest(SequenceLayerTest):
+  """Abstract tests for the Sequence class."""
+
+  @parameterized.named_parameters(
+      ('mask_value=None', 0.0, None),
+      ('mask_value=0.0', 0.0, 0.0),
+      ('mask_value=-1.0', -1.0, -1.0),
+  )
+  def test_mask_invalid(
+      self, mask_value: float, expected_mask_value: float | None
+  ) -> None:
+    values = self.xp.array([
+        [1.0, 2.0, 3.0, 4.0],
+        [10.0, 20.0, 30.0, 40.0],
+    ])
+    mask = self.xp.array(
+        [[True, True, False, False], [False, False, False, True]]
+    )
+
+    # Pass mask_value only if it is not None (to test default None behavior vs
+    # explicit value)
+    if expected_mask_value is None:
+      output = self.sl.types.Sequence(values, mask).mask_invalid()
+      fill_value = 0.0
+    else:
+      output = self.sl.types.Sequence(values, mask).mask_invalid(mask_value)
+      fill_value = mask_value
+
+    expected_values = self.xp.array([
+        [1.0, 2.0, fill_value, fill_value],
+        [fill_value, fill_value, fill_value, 40.0],
+    ])
+    self.assertAllEqual(output.values, expected_values)
+    self.assertAllEqual(output.mask, mask)
+
+  def test_pad_time(self) -> None:
+    values = self.xp.array([
+        [1.0, 2.0, 3.0, 4.0],
+        [10.0, 20.0, 30.0, 40.0],
+    ])
+    mask = self.xp.array(
+        [[True, True, False, False], [False, False, False, True]]
+    )
+
+    x = self.sl.types.Sequence(values, mask).mask_invalid()
+
+    y = x.pad_time(0, 0, valid=False)
+    self.assertAllEqual(y.values, x.values)
+    self.assertAllEqual(y.mask, x.mask)
+
+    y = x.pad_time(1, 0, valid=False)
+
+    x_left1 = self.sl.types.Sequence(
+        self.xp.array([
+            [0.0, 1.0, 2.0, 3.0, 4.0],
+            [0.0, 10.0, 20.0, 30.0, 40.0],
+        ]),
+        self.xp.array([
+            [False, True, True, False, False],
+            [False, False, False, False, True],
+        ]),
+    ).mask_invalid()
+    self.assertAllEqual(y.values, x_left1.values)
+    self.assertAllEqual(y.mask, x_left1.mask)
+
+  def _create_test_sequence(
+      self, shape: types_spec.Shape
+  ) -> types_spec.Sequence[types_spec.Array, types_spec.Array]:
+    """Creates a test sequence with specific shape."""
+    size = 1
+    for d in shape:
+      size *= d
+    values_np = np.arange(size, dtype=np.float32).reshape(shape)
+    mask_np = np.ones(shape[:2], dtype=bool)
+    if shape[0] > 0 and shape[1] > 1:
+      mask_np[0, 1] = False
+
+    values = self.xp.array(values_np)
+    mask = self.xp.array(mask_np)
+    return self.sl.types.Sequence(values, mask)
+
+  def test_slice(self) -> None:
+    x = self._create_test_sequence((3, 5, 9))
+
+    self.assertSequencesEqual(
+        x[:, 1:], self.sl.types.Sequence(x.values[:, 1:], x.mask[:, 1:])
+    )
+    self.assertSequencesEqual(
+        x[:, ::2], self.sl.types.Sequence(x.values[:, ::2], x.mask[:, ::2])
+    )
+    self.assertSequencesEqual(
+        x[::2, ::3],
+        self.sl.types.Sequence(x.values[::2, ::3], x.mask[::2, ::3]),
+    )
+
+  def test_slice_can_slice_channel_dimensions(self) -> None:
+    x = self._create_test_sequence((3, 5, 9, 4))
+
+    self.assertSequencesEqual(
+        x[:, 1:, :], self.sl.types.Sequence(x.values[:, 1:], x.mask[:, 1:])
+    )
+    self.assertSequencesEqual(
+        x[:, ::2, :3],
+        self.sl.types.Sequence(x.values[:, ::2, :3], x.mask[:, ::2]),
+    )
+
+  def test_apply_values(self) -> None:
+    values = self.xp.array([
+        [-1.0, 2.0, 3.0, 4.0],
+        [10.0, -20.0, 30.0, 40.0],
+    ])
+    mask = self.xp.array(
+        [[True, True, False, False], [False, True, False, True]]
+    )
+
+    x = self.sl.types.Sequence(values, mask)
+    masked = x.mask_invalid()
+
+    # Simple abs function
+    fn = abs
+
+    y = x.apply_values(fn)
+    self.assertAllEqual(y.values, fn(x.values))
+    self.assertAllEqual(y.mask, x.mask)
+
+    y = masked.apply_values(fn)
+    self.assertAllEqual(y.values, fn(masked.values))
+    self.assertAllEqual(y.mask, x.mask)
+
+    y = masked.apply_values_masked(fn)
+    self.assertAllEqual(y.values, fn(masked.values))
+    self.assertAllEqual(y.mask, x.mask)
+
+  def test_apply_values_args(self) -> None:
+    values = self.xp.array([
+        [-1.0, 2.0, 3.0, 4.0],
+        [10.0, -20.0, 30.0, 40.0],
+    ])
+    mask = self.xp.array(
+        [[True, True, False, False], [False, True, False, True]]
+    )
+    x = self.sl.types.Sequence(values, mask)
+
+    target_shape = (2, 4, 1)
+    y = x.apply_values(lambda v, s: v.reshape(s), target_shape)
+    self.assertAllEqual(y.values.shape, target_shape)
+    self.assertAllEqual(y.mask.shape, (2, 4))
+
+  def test_from_values(self) -> None:
+    values_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+    values = self.xp.array(values_np)
+    x = self.sl.types.Sequence.from_values(values)
+    self.assertAllEqual(x.values, values)
+    self.assertAllEqual(
+        x.mask, self.xp.array(np.ones(values.shape[:2], dtype=bool))
+    )
+
+  def test_astype(self) -> None:
+    values_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+    mask_np = np.array([[True, False], [False, True]], dtype=bool)
+
+    values = self.xp.array(values_np)
+    mask = self.xp.array(mask_np)
+
+    x = self.sl.types.Sequence(values, mask)
+
+    y = x.astype(self.xp.int32)
+
+    # Check values match casted version
+    self.assertAllEqual(y.mask, mask)
+    # y.values might be mlx array, values.astype(dtype) might be numpy if values
+    # was numpy? values is backend array. values.astype(dtype) should work if
+    # dtype is backend dtype.
+    self.assertAllEqual(y.values, values.astype(self.xp.int32))
+
+  def test_mask_invalid_idempotent(self) -> None:
+    values = self.xp.array([
+        [1.0, 2.0, 3.0, 4.0],
+        [10.0, 20.0, 30.0, 40.0],
+    ])
+    mask = self.xp.array(
+        [[True, True, False, False], [False, False, False, True]]
+    )
+
+    x = self.sl.types.Sequence(values, mask)
+    masked = x.mask_invalid()
+    self.assertIsNot(masked, x)
+    self.assertIsInstance(masked, self.sl.types.MaskedSequence)
+
+    masked_again = masked.mask_invalid()
+    self.assertIs(masked_again, masked)
+    self.assertIsInstance(masked_again, self.sl.types.MaskedSequence)
+
+    masked2 = x.mask_invalid()
+    self.assertIsNot(masked2, masked)
+    self.assertIsInstance(masked2, self.sl.types.MaskedSequence)
+
+  def test_from_lengths(self) -> None:
+    """Tests creating a sequence from lengths."""
+    values = self.xp.array(
+        np.arange(5 * 17 * 2).reshape((5, 17, 2)).astype(np.float32)
+    )
+    lengths_np = np.array([0, 5, 10, 17, 12], dtype=np.int32)
+    mask_np = np.arange(17)[None, :] < lengths_np[:, None]
+    mask = self.xp.array(mask_np)
+    lengths = self.xp.array(lengths_np)
+
+    x_expected = self.sl.types.Sequence(values, mask)
+    x = self.sl.types.Sequence.from_lengths(x_expected.values, lengths)
+    self.assertAllEqual(x.values, x_expected.values)
+    self.assertAllEqual(x.mask, x_expected.mask)
+
+    # Out of range lengths are clipped to 0 or max.
+    x = self.sl.types.Sequence.from_lengths(
+        x_expected.values, self.xp.array([-1, 5, 10, 17, 18])
+    )
+    self.assertAllEqual(x.lengths(), self.xp.array([0, 5, 10, 17, 17]))
+    self.assertNotIsInstance(x, self.sl.types.MaskedSequence)
+
+    # Return type is MaskedSequence if is_masked=True.
+    x = self.sl.types.Sequence.from_lengths(
+        x_expected.values, self.xp.array([-1, 5, 10, 17, 18]), is_masked=True
+    )
+    self.assertAllEqual(x.lengths(), self.xp.array([0, 5, 10, 17, 17]))
+    self.assertIsInstance(x, self.sl.types.MaskedSequence)
+
+
+class SteppableTest(SequenceLayerTest):
+  """Abstract tests for Steppable layers."""
+
+  def create_steppable(self) -> types_spec.Steppable:
+    """Creates a basic Steppable instance."""
+    backend_sl = self.sl
+
+    class DefaultSteppable(DefaultTestLayer, backend_sl.types.Steppable):
+      """Mock layer for testing."""
+
+      @override
+      def layer_with_emits(self, *args, **kwargs):
+        return backend_sl.types.Steppable.layer_with_emits(
+            self, *args, **kwargs
+        )
+
+      @override
+      def step_with_emits(self, *args, **kwargs):
+        return backend_sl.types.Steppable.step_with_emits(self, *args, **kwargs)
+
+    return DefaultSteppable()
+
+  def test_steppable_defaults(self) -> None:
+    layer = self.create_steppable()
+    self.assertEqual(layer.block_size, 1)
+    self.assertEqual(layer.output_ratio, fractions.Fraction(1))
+    self.assertTrue(layer.supports_step)
+    self.assertEqual(layer.input_latency, 0)
+    self.assertEqual(layer.output_latency, 0)
+    self.assertEqual(layer.get_accumulated_input_latency(0), 0)
+    self.assertEqual(layer.get_accumulated_output_latency(0), 0)
+
+  def test_get_output_spec(self) -> None:
+    layer = self.create_steppable()
+    input_spec = DummyChannelSpec(shape=(2, 3), dtype=np.float32)
+    output_spec = layer.get_output_spec(input_spec)
+    self.assertEqual(output_spec.shape, (2, 3, 1))
+    self.assertEqual(output_spec.dtype, np.float64)
+
+  def create_sequence(self) -> types_spec.Sequence:
+    """Creates a test sequence."""
+    return self.sl.types.Sequence(
+        self.xp.zeros((2, 3, 5)), self.xp.zeros((2, 3), dtype=self.xp.bool_)
+    )
+
+  def test_steppable_with_emits_defaults_to_tuple_with_empty_emits(
+      self,
+  ) -> None:
+    layer = self.create_steppable()
+    seq = self.create_sequence()
+    state_in = {'a': 'b'}
+    state_out = {1: 2}
+
+    with unittest.mock.patch.object(
+        layer, 'layer', return_value=seq
+    ) as mock_layer:
+      out, emits = layer.layer_with_emits(seq, training=False, constants=None)
+      self.assertEqual(out, seq)
+      self.assertEqual(emits, ())
+      mock_layer.assert_called_with(seq, training=False, constants=None)
+
+    with unittest.mock.patch.object(
+        layer, 'step', return_value=(seq, state_out)
+    ) as mock_step:
+      out, state, emits = layer.step_with_emits(
+          seq, state_in, training=True, constants=None
+      )
+      self.assertEqual(out, seq)
+      self.assertEqual(state, state_out)
+      self.assertEqual(emits, ())
+      mock_step.assert_called_with(seq, state_in, training=True, constants=None)
+
+
+class SequenceLayerConfigTest(SequenceLayerTest):
+  """Abstract tests for SequenceLayerConfig behaviors."""
+
+  def test_copy(self) -> None:
+    backend_sl = self.sl
+
+    @dataclasses.dataclass(frozen=True)
+    class Config(backend_sl.types.SequenceLayerConfig):
+      """Mock config."""
+
+      a: int = 1234
+      b: str = 'default string'
+
+      @override
+      def make(self) -> Any:
+        """Makes a dummy layer."""
+        return 'dummy_layer'
+
+      @override
+      def copy(self, **kwargs: Any) -> Any:
+        """Returns a copy of the config."""
+        return dataclasses.replace(self, **kwargs)
+
+    config = Config()
+    new_config = config.copy(b='new string')
+    self.assertEqual(new_config.a, config.a)
+    self.assertEqual(new_config.b, 'new string')
+
+  def test_copy_raises_on_non_dataclass(self) -> None:
+    backend_sl = self.sl
+
+    class NonDataclassConfig(backend_sl.types.SequenceLayerConfig):  # pylint: disable=too-few-public-methods
+      """Non-dataclass mock config."""
+
+      @override
+      def make(self) -> Any:
+        """Makes a dummy layer."""
+        return 'dummy_layer'
+
+      @override
+      def copy(self, **kwargs: Any) -> Any:
+        """Returns a copy of the config."""
+        raise TypeError('Mock non-dataclass config')
+
+    config = NonDataclassConfig()
+    with self.assertRaises(TypeError):
+      new_config = config.copy()
+      del new_config
+
+  def test_copy_disallows_new_fields(self) -> None:
+    backend_sl = self.sl
+
+    @dataclasses.dataclass(frozen=True)
+    class Config(backend_sl.types.SequenceLayerConfig):
+      """Mock config."""
+
+      @override
+      def make(self) -> Any:
+        """Makes a dummy layer."""
+        return 'dummy_layer'
+
+      @override
+      def copy(self, **kwargs: Any) -> Any:
+        """Returns a copy of the config."""
+        return dataclasses.replace(self, **kwargs)
+
+    config = Config()
+    # dataclasses.replace raises TypeError for unknown arguments
+    # JAX implementation wraps it in AttributeError
+    with self.assertRaises((TypeError, AttributeError)):
+      new_config = config.copy(field_does_not_exist=1234)
+      del new_config
+
+
+class PreservesTypeTest(SequenceLayerTest):
+  """Abstract tests for PreservesType behaviors."""
+
+  def create_layer(self) -> types_spec.PreservesType:
+    """Creates a preserves type layer."""
+    backend_sl = self.sl
+
+    class DummyLayer(DefaultTestLayer, backend_sl.types.PreservesType):
+      """Mock layer for testing."""
+
+      @override
+      def get_output_dtype(self, *args, **kwargs):
+        return backend_sl.types.PreservesType.get_output_dtype(
+            self, *args, **kwargs
+        )
+
+    return DummyLayer()
+
+  def test_preserves_dtype(self) -> None:
+    layer = self.create_layer()
+    self.assertEqual(layer.get_output_dtype('fake_dtype123'), 'fake_dtype123')
+
+
+class PreservesShapeTest(SequenceLayerTest):
+  """Abstract tests for PreservesShape behaviors."""
+
+  def create_layer(self) -> types_spec.PreservesShape:
+    """Creates a preserves shape layer."""
+    backend_sl = self.sl
+
+    class DummyLayer(DefaultTestLayer, backend_sl.types.PreservesShape):
+      """Mock layer for testing."""
+
+      @override
+      def get_output_shape(self, *args, **kwargs):
+        return backend_sl.types.PreservesShape.get_output_shape(
+            self, *args, **kwargs
+        )
+
+    return DummyLayer()
+
+  def test_preserves_shape(self) -> None:
+    layer = self.create_layer()
+    self.assertEqual(layer.get_output_shape((1, 2, 3, 5)), (1, 2, 3, 5))
+
+
+class StatelessTest(SequenceLayerTest):
+  """Abstract tests for Stateless layer behaviors."""
+
+  def create_sequence(self) -> types_spec.Sequence:
+    """Creates a default test sequence."""
+    return self.sl.types.Sequence(
+        self.xp.zeros((2, 3, 5)), self.xp.zeros((2, 3), dtype=self.xp.bool_)
+    )
+
+  def create_layer(self) -> types_spec.Stateless:
+    """Creates a stateless layer."""
+    backend_sl = self.sl
+
+    class DummyLayer(DefaultTestLayer, backend_sl.types.Stateless):
+      """Mock layer for testing."""
+
+      @override
+      def get_initial_state(self, *args, **kwargs):
+        return backend_sl.types.Stateless.get_initial_state(
+            self, *args, **kwargs
+        )
+
+      @override
+      def step(self, *args, **kwargs):
+        return backend_sl.types.Stateless.step(self, *args, **kwargs)
+
+    return DummyLayer()
+
+  def test_stateless_behaviors(self) -> None:
+    layer = self.create_layer()
+
+    # Initial state must be empty
+    self.assertEqual(
+        layer.get_initial_state(
+            32,
+            DummyChannelSpec(shape=(2, 3), dtype=np.float32),
+            training=False,
+        ),
+        (),
+    )
+
+    # step unconditionally delegates to layer and returns identical empty state
+    x = self.create_sequence()
+    with unittest.mock.patch.object(
+        layer, 'layer', return_value='layer_out'
+    ) as mock_layer:
+      out, state = layer.step(
+          x, 'mock_state', training=True, constants={'c': 1}
+      )
+      self.assertEqual(out, 'layer_out')
+      self.assertEqual(state, 'mock_state')
+      mock_layer.assert_called_once_with(x, training=True, constants={'c': 1})
+
+
+class EmittingTest(SequenceLayerTest):
+  """Abstract tests for Emitting layer behaviors."""
+
+  def create_sequence(self) -> types_spec.Sequence:
+    """Creates a default test sequence."""
+    return self.sl.types.Sequence(
+        self.xp.zeros((2, 3, 5)), self.xp.zeros((2, 3), dtype=self.xp.bool_)
+    )
+
+  def create_layer(self) -> types_spec.Emitting:
+    """Creates an emitting layer."""
+    backend_sl = self.sl
+
+    class DummyLayer(DefaultTestLayer, backend_sl.types.Emitting):
+      """Mock layer for testing."""
+
+      @override
+      def layer(self, *args, **kwargs):
+        return backend_sl.types.Emitting.layer(self, *args, **kwargs)
+
+      @override
+      def step(self, *args, **kwargs):
+        return backend_sl.types.Emitting.step(self, *args, **kwargs)
+
+    return DummyLayer()
+
+  def test_emitting_drops_emits_on_standard_calls(self) -> None:
+    layer = self.create_layer()
+    x = self.create_sequence()
+
+    with unittest.mock.patch.object(
+        layer, 'layer_with_emits', return_value=('out', 'emits')
+    ) as m_layer:
+      self.assertEqual(layer.layer(x, training=False), 'out')
+      m_layer.assert_called_once_with(x, training=False, constants=None)
+
+    with unittest.mock.patch.object(
+        layer, 'step_with_emits', return_value=('out', 'state', 'emits')
+    ) as m_step:
+      out, state = layer.step(x, 'state', training=True, constants={'c': 1})
+      self.assertEqual(out, 'out')
+      self.assertEqual(state, 'state')
+      m_step.assert_called_once_with(
+          x, 'state', training=True, constants={'c': 1}
+      )
+
+
+class StatelessEmittingTest(SequenceLayerTest):
+  """Abstract tests for StatelessEmitting layer behaviors."""
+
+  def create_sequence(self) -> types_spec.Sequence:
+    """Creates a default test sequence."""
+    return self.sl.types.Sequence(
+        self.xp.zeros((2, 3, 5)), self.xp.zeros((2, 3), dtype=self.xp.bool_)
+    )
+
+  def create_layer(self) -> types_spec.SequenceLayer:
+    """Creates a stateless emitting layer."""
+    backend_sl = self.sl
+
+    class DummyLayer(DefaultTestLayer, backend_sl.types.StatelessEmitting):
+      """Mock layer for testing."""
+
+      @override
+      def get_initial_state(self, *args, **kwargs):
+        return backend_sl.types.StatelessEmitting.get_initial_state(
+            self, *args, **kwargs
+        )
+
+      @override
+      def step_with_emits(self, *args, **kwargs):
+        return backend_sl.types.StatelessEmitting.step_with_emits(
+            self, *args, **kwargs
+        )
+
+    return DummyLayer()
+
+  def test_stateless_emitting_behaviors(self) -> None:
+    layer = self.create_layer()
+
+    self.assertEqual(
+        layer.get_initial_state(
+            32,
+            DummyChannelSpec(shape=(2, 3), dtype=np.float32),
+            training=False,
+        ),
+        (),
+    )
+
+    x = self.create_sequence()
+    with unittest.mock.patch.object(
+        layer, 'layer_with_emits', return_value=('out', 'emits')
+    ) as m_layer:
+      out, state, emits = layer.step_with_emits(x, 'state', training=False)
+      self.assertEqual(out, 'out')
+      self.assertEqual(state, 'state')
+      self.assertEqual(emits, 'emits')
+      m_layer.assert_called_once_with(x, training=False, constants=None)
+
+
+class StatelessPointwiseFunctorTest(SequenceLayerTest):
+  """Abstract tests for StatelessPointwiseFunctor layer behaviors."""
+
+  def create_layer(
+      self, is_mask_required: bool
+  ) -> types_spec.SequenceLayer[Any]:
+    """Creates a stateless pointwise functor layer."""
+
+    backend_sl = self.sl
+
+    class DummyLayer(
+        DefaultTestLayer, backend_sl.types.StatelessPointwiseFunctor
+    ):
+      """Mock layer for testing."""
+
+      @override
+      def layer(self, *args, **kwargs):
+        return backend_sl.types.StatelessPointwiseFunctor.layer(
+            self, *args, **kwargs
+        )
+
+      @override
+      def get_output_shape(self, *args, **kwargs):
+        return backend_sl.types.StatelessPointwiseFunctor.get_output_shape(
+            self, *args, **kwargs
+        )
+
+      @property
+      @override
+      def mask_required(self) -> bool:
+        """Whether mask is required."""
+        return is_mask_required
+
+      @override
+      def fn(self, values: Any, mask: Any) -> tuple[Any, Any]:
+        """Pointwise function."""
+        return values, mask
+
+    return DummyLayer()
+
+  def create_sequence(
+      self,
+  ) -> types_spec.Sequence[types_spec.Array, types_spec.Array]:
+    """Creates a test sequence."""
+    return self.sl.types.Sequence(
+        self.xp.zeros((2, 3, 5)), self.xp.zeros((2, 3), dtype=self.xp.bool_)
+    )
+
+  def test_layer_applies_fn_based_on_mask_required(self) -> None:
+    for mask_required in [True, False]:
+      with self.subTest(mask_required=mask_required):
+        layer = self.create_layer(mask_required)
+        x = self.create_sequence()
+        # Mock the apply methods on the Sequence class itself so we return a
+        # valid Sequence that satisfies any @check_layer decorators.
+        with unittest.mock.patch.object(
+            type(x), 'apply', return_value=x
+        ) as mock_apply:
+          with unittest.mock.patch.object(
+              type(x), 'apply_masked', return_value=x
+          ) as mock_apply_masked:
+            layer.layer(x, training=False)
+
+            if mask_required:
+              mock_apply.assert_called_once()
+              mock_apply_masked.assert_not_called()
+            else:
+              mock_apply_masked.assert_called_once()
+              mock_apply.assert_not_called()

From b2591925acc8465f9471bf3cf37c15ea3fcbbd0e Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Fri, 29 May 2026 09:32:30 +0000
Subject: [PATCH 03/29] refactor: Align JAX/MLX backends and introduce backend
 operations

* Align Flax-based JAX implementations to inherit from specs protocols.
* Align MLX-based implementations to inherit from specs protocols.
* Delete obsolete mlx/basic_types.py.
* Abstract and implement backend-agnostic array/nn operations (xp, nn).
* Resolve JAX attention namespace collision by aligning imports.
* Add complete multi-backend coding guides (AGENTS.md, evolved DESIGN.md).

Co-authored-by: David Braun <davidbraun@google.com>
Co-authored-by: Kehang Han <kehanghan@google.com>

PiperPending-RevId: 923278027
PiperOrigin-RevId: 923278027
---
 AGENTS.md                                     | 167 +++++
 DESIGN.md                                     |  72 ++
 sequence_layers/abstract/types.py             | 313 ---------
 sequence_layers/abstract/types_test_base.py   | 384 -----------
 sequence_layers/jax/__init__.py               |   6 +
 sequence_layers/jax/attention/common.py       |  15 +
 sequence_layers/jax/backend.py                |  42 ++
 .../__init__.py => jax/backend_test.py}       |  15 +-
 sequence_layers/jax/dsp.py                    |   2 +-
 sequence_layers/jax/test_utils.py             |  13 +-
 sequence_layers/jax/types.py                  | 246 +++++--
 sequence_layers/jax/types_test.py             | 120 ++--
 sequence_layers/jax/typing.py                 |  23 +-
 sequence_layers/mlx/__init__.py               |   8 +
 sequence_layers/mlx/backend.py                |  43 ++
 sequence_layers/mlx/backend_test.py           |  26 +
 sequence_layers/mlx/basic_types.py            | 133 ----
 sequence_layers/mlx/types.py                  | 634 +++++++++++++++---
 sequence_layers/mlx/types_test.py             |  68 +-
 19 files changed, 1215 insertions(+), 1115 deletions(-)
 create mode 100644 AGENTS.md
 delete mode 100644 sequence_layers/abstract/types.py
 delete mode 100644 sequence_layers/abstract/types_test_base.py
 create mode 100644 sequence_layers/jax/backend.py
 rename sequence_layers/{abstract/__init__.py => jax/backend_test.py} (65%)
 create mode 100644 sequence_layers/mlx/backend.py
 create mode 100644 sequence_layers/mlx/backend_test.py
 delete mode 100644 sequence_layers/mlx/basic_types.py

diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..d23537f
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,167 @@
+# SequenceLayers Multi-Backend: Coding Guide
+
+> [!IMPORTANT] Read `DESIGN.md` (§7) for the rationale behind this architecture.
+
+## Scope
+
+Design rules (🏛️) are universal and apply whenever extracting shared interfaces.
+Toolchain rules (🔧) apply to files checked with pyrefly (currently `specs/` and
+new files; pyrefly is available via `pyproject.toml`).
+
+## Framework Status
+
+1.  **JAX**: Stable, backward-compat required. Do not change public APIs (names,
+    interfaces, type signatures) without explicit justification.
+2.  **TF**: Deprecated. Ignore this directory and its conventions.
+3.  **MLX**: In progress. See Porting Workflow below.
+
+## Porting Workflow
+
+When porting or refactoring across backends, you are likely in one of these
+scenarios (or a **Full Port** combining them):
+
+1.  **Interface Extraction**: JAX and MLX implementations exist but don't
+    inherit from a shared interface. Find shared config fields, layer classes,
+    methods, and arguments. Codify the shared interface in `specs/*.py`. Both
+    backends should inherit from the spec class (in appropriate MRO order; see
+    rule 4), provided it does not break JAX backward compatibility.
+2.  **Test Unification**: JAX and MLX implementations exist but don't share
+    tests. Given the unified interface in `specs/*.py`, refactor shared test
+    logic into `specs/*_behaviors.py`. *Prefer JAX tests as the basis when they
+    cover equivalent features.*
+3.  **Full Port / Feature Porting**: A JAX-only layer needs an MLX port. **Start
+    by abstracting the tests** (TDD): codify the interface in `specs/*.py` and
+    tests in `specs/*_behaviors.py` (preferring JAX tests as basis), then create
+    or update `mlx/*(_test).py`.
+4.  **Backend-specific supersets**: When a backend implements extra features
+    beyond the shared spec, common functionality goes in `specs/*_behaviors.py`,
+    while backend-specific extensions stay in `<backend>/*_test.py`. Consider
+    whether extended features could be generalized into the shared spec.
+
+--------------------------------------------------------------------------------
+
+## Design Rules 🏛️
+
+### Architecture
+
+1.  **Up-front readability**: Backend files must be self-contained. Re-declare
+    all defaults, docstrings, function signatures, and Config fields. Users
+    should never need to read `specs/` to understand a backend's API.
+    -   *Exception*: Pure functions that are part of the contract all backends
+        must fulfill (e.g., test utilities like `zip_longest`, `named_product`)
+        may live in `specs/` and be aliased by backends.
+2.  **Generics and specialization**: Spec classes (layers and Configs) are
+    generic (e.g., over `DTypeT`, `SequenceT`). Backends specialize with
+    concrete types.
+3.  **Rigid signatures / LSP**: Match spec parameter names and signatures
+    exactly. No `**kwargs`. Include all protocol parameters (e.g., `training:
+    bool`) even if unused by a particular backend — this maintains Liskov
+    Substitution Principle compliance.
+4.  **MRO**: The abstract spec class should be the last one inherited. Example:
+    `class StatelessEmitting(Emitting, spec.StatelessEmitting)`
+5.  **Circular import prevention**: When submodules import root-level aliases
+    from `__init__.py`, ensure all root-level alias imports are placed at the
+    **top** of `__init__.py`, before importing any submodule classes.
+6.  **Decoupled instantiation**: Use `Layer.from_config(config)` factory methods
+    on the framework-specific class, not `Config.make(backend=...)`. Spec
+    configs remain abstract.
+7.  **Deferred initialization for stateless backends**: Backends without eager
+    parameter allocation (e.g., MLX) should use lazy submodule creation within
+    `_ensure_initialized` rather than maintaining a separate wrapper class. The
+    public class accepts a `Config` and lazily creates its internal submodules
+    on the first call to `layer()`.
+8.  **Config specs nested**: In `specs/` files, `Config` classes are nested
+    within the layer classes they configure, paralleling the structure in
+    backend implementations.
+
+### Testing
+
+1.  **Behavior tests via inheritance**: `specs/*_behaviors.py` defines
+    backend-agnostic test cases. `<backend>/*_test.py` inherits from these.
+    -   No `abc.ABC` in behavior test classes (they won't be discovered by
+        pytest since files are named `*_behaviors.py`, not `*_test.py`).
+    -   No cross-importing between behavior files. Prefer duplicating small
+        helpers or using shared bases in non-behavior modules.
+    -   Inherit from `test_utils.SequenceLayerTest` (or similar shared base). In
+        `<backend>/*_test.py`, subclass `test_utils.SequenceLayerTest` first
+        (MRO convention).
+2.  **Backend-native syntax in tests**: In `<backend>/*_test.py`, use
+    backend-specific types (`jnp`, `sl.Sequence`, etc.). Import the backend as
+    `sl` (e.g., `import sequence_layers.mlx as sl`).
+3.  **Avoid `super()` in diamond test hierarchies**: When dealing with diamond
+    inheritance (test base + backend-specific mock), `super()` calls can be
+    brittle. Use explicit class delegation (e.g.,
+    `backend_sl.types.Stateless.step(self, ...)`).
+4.  **Capture `self.sl` before nested classes**: Capture `backend_sl = self.sl`
+    in the outer method before defining a local mock class (like `DummyLayer`)
+    to avoid scoping issues with static analysis tools.
+5.  **Use `backend.xp` / `backend.nn`**: In shared behavior tests, avoid
+    importing backend-specific libraries directly. Use `self.sl.backend.xp` for
+    array ops and `self.sl.backend.nn` for neural network ops.
+
+### ModuleSpec
+
+1.  **Collocation**: Define `ModuleSpec` protocols in the specific spec module
+    they describe (e.g., `specs/simple.py`, `specs/types.py`), not in
+    `specs/__init__.py`.
+2.  **`__all__` from `ModuleSpec.__dict__`**: Files defining `ModuleSpec` should
+    derive `__all__` dynamically to keep exports aligned with the protocol.
+3.  **Protocol alignment**: Keep protocols aligned with usage in shared tests.
+    When exposing new modules or utilities via backend implementations, update
+    the relevant `ModuleSpec`.
+
+--------------------------------------------------------------------------------
+
+## Toolchain Rules 🔧
+
+*Apply to files checked with pyrefly (currently `specs/` and new files).*
+
+1.  **PEP 695 syntax**: Use `class Foo[T]:` instead of `TypeVar` +
+    `Generic[...]`. Legacy files may use the older syntax.
+2.  **Pyrefly priority**: Pyrefly over Pylint for structural correctness and
+    type safety. Use `from typing import ...` (no `import typing`). Fix warnings
+    up-front; never add `# type: ignore` without justification. If proposing
+    disables, prefer disabling in Pylint over Pyrefly.
+3.  **`@override` mandatory**: Implementations of abstract methods in backends
+    must be decorated with `@override` (from `typing`).
+4.  **Import naming from `specs`**:
+    -   If it is the "specification" for the current file, import as `spec`
+        (e.g., `test_utils.py` imports `specs/test_utils.py` as `spec`).
+    -   Otherwise, import as `<module>_spec` (e.g., `test_utils.py` imports
+        `specs/types.py` as `types_spec`).
+    -   Within `specs/` itself, always use the `_spec` suffix to avoid
+        ambiguity.
+5.  **Lint disable policy**: Broad-scoped disables are only allowed for these
+    cases:
+    -   `specs/*_behaviors.py`: `# pylint: disable=abstract-method` and `#
+        pyrefly: disable=bad-instantiation` at the file level (test classes
+        inherit abstract methods implemented in backend test files).
+    -   `ModuleSpec` protocols: `# pylint: disable=invalid-name` and `# pylint:
+        disable=missing-function-docstring` at the class level.
+    -   JAX layer implementations (e.g., `jax/dense.py`): `# pylint:
+        disable=abstract-method,abstract-class-instantiated` at the file level
+        (Pylint cannot see through Flax's metaclass wrappers; compliance is
+        guaranteed by Pyrefly and runtime tests).
+
+--------------------------------------------------------------------------------
+
+## Validation
+
+**Formatting, linting, static analysis**: Scope to the files you modified only.
+
+1.  **Format**: `pyink <file>`, `isort <file>`.
+2.  **Lint**: `pylint <file>` — fix all warnings. Do not claim "false positive"
+    without demonstrating it.
+3.  **Static analysis**: `pyrefly check <file>` — for pyrefly-checked files.
+
+> [!IMPORTANT] Do not fix pre-existing errors in files you did not modify.
+
+**Tests**: Scope depends on what you changed:
+
+| What you changed         | Test scope                                     |
+| ------------------------ | ---------------------------------------------- |
+| `<backend>/*.py` only    | That backend's `*_test.py` files               |
+| `specs/*.py` (protocols) | Static analysis usually suffices. Run          |
+|                          | `<backend>/*_test.py` if you added or changed  |
+|                          | abstract methods/signatures.                   |
+| `specs/*_behaviors.py`   | **All** inheriting `<backend>/*_test.py` files |
diff --git a/DESIGN.md b/DESIGN.md
index f7dbb8e..d5a4ee1 100644
--- a/DESIGN.md
+++ b/DESIGN.md
@@ -164,3 +164,75 @@ combinators:
     automatically implementing `layer` in terms of `step` to reduce peak memory.
 
 --------------------------------------------------------------------------------
+
+## 7. Multi-Backend Architecture
+
+### Why Multi-Backend?
+
+A core feature of SequenceLayers is **direct inspectability**: configs and model
+logic live beside each other, so clicking through to `sl.Dense` shows the full
+implementation in your framework. However, supporting multiple backends (JAX,
+MLX, and potentially PyTorch) means each backend must have its own native
+implementation — direct inspectability requires code duplication.
+
+Without safeguards, separate implementations inevitably **diverge** in
+interfaces (different names, configs, method signatures), behaviors (different
+numerical results for the same model), and implementations (different efficiency
+characteristics). While implementation equivalence is
+[undecidable in general](https://en.wikipedia.org/wiki/Rice's_theorem), we *can*
+enforce equivalence in interfaces and behaviors.
+
+### How: Three Enforcement Mechanisms
+
+1.  **Interface equivalence via protocols.** Shared abstract classes and
+    [protocols](https://typing.python.org/en/latest/spec/glossary.html#term-structural)
+    in `specs/*.py` define standardized layer names, configs, methods, and
+    signatures. All backends inherit from these.
+2.  **Behavior equivalence via shared tests.** Backend-agnostic test cases in
+    `specs/*_behaviors.py` verify that implementations produce equivalent
+    results (e.g., step-layer equivalence, expected outputs). Backend test files
+    inherit these and only add backend-specific extensions.
+3.  **Implementation sharing via pure functions.** Where frameworks share a
+    NumPy-compatible API, backend-generic pure functions (e.g.,
+    `compute_flash_attention`) can be shared, as long as direct inspectability
+    of high-level layer semantics is preserved.
+
+**Model conversion** across backends is a future goal: given interface,
+behavior, and parameter equivalence, cross-platform weight transfer should be
+possible.
+
+### Package Structure
+
+SequenceLayers supports multiple frameworks (JAX, MLX) via a three-tier package
+structure:
+
+```
+specs/          ← Backend-agnostic protocols, contracts, and shared behaviors
+  types.py           Protocols for Sequence, SequenceLayer, Config, etc.
+  types_behaviors.py Behavioral tests (step-layer equiv, etc.)
+  backend.py         Protocol for backend-specific ops (xp, nn)
+  test_utils.py      Shared test infrastructure
+
+jax/            ← JAX-native implementations (the production backend)
+  types.py           Inherits from specs, implements via Flax
+  backend.py         JAX backend: xp=jnp, nn=jax.nn
+  test_utils.py      JAX-specific test setup
+
+mlx/            ← MLX-native implementations
+  types.py           Inherits from specs, implements via mlx.nn
+  backend.py         MLX backend: xp=mx, nn=mlx.nn
+  test_utils.py      MLX-specific test setup
+```
+
+**Key principles:**
+
+*   **`specs/` is purely declarative.** It defines *what* backends must do
+    (protocols, type constraints), not *how*. Default implementations belong in
+    the backend-specific files.
+*   **Tests are shared via inheritance.** `specs/*_behaviors.py` defines
+    backend-agnostic test cases. Backend test files inherit these and only add
+    backend-specific extensions.
+*   **Direct inspectability is preserved.** Users of `jax/types.py` see full
+    implementations and docstrings without needing to read `specs/`.
+
+See `AGENTS.md` for detailed development conventions.
diff --git a/sequence_layers/abstract/types.py b/sequence_layers/abstract/types.py
deleted file mode 100644
index a450bf4..0000000
--- a/sequence_layers/abstract/types.py
+++ /dev/null
@@ -1,313 +0,0 @@
-# Copyright 2026 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Abstract base classes and types for SequenceLayers."""
-
-import abc
-import enum
-import fractions
-from typing import Any, Callable, Generic, Iterable, Literal, TypeVar
-
-# Type aliases for generic usage
-T = TypeVar('T')
-ValuesT = TypeVar('ValuesT')
-MaskT = TypeVar('MaskT')
-SequenceSelf = TypeVar('SequenceSelf', bound='Sequence')
-Shape = tuple[int, ...]
-ShapeLike = list[int] | tuple[int, ...]
-DType = Any  # Can be numpy, jax, or mlx dtype
-ChannelSpec = Any  # Typically ShapeDType or compatible
-
-
-class PaddingMode(enum.Enum):
-  """Supported padding modes."""
-
-  # In VALID padding mode, no padding is applied.
-  #
-  # Key properties:
-  # * The physical length of an input array to a VALID padded function shrinks,
-  #   dropping any timesteps whose inputs are computed from implicit edge
-  #   padding.
-  # * An output timestep is valid when all of its input timesteps are also
-  #   valid.
-  VALID = 'valid'
-
-  # In SAME padding mode, the input sequence is padded such that the output
-  # length is equal to the input length before applying striding.
-  #
-  # Key properties:
-  # * The input length is equal to the output length, before applying striding.
-  # * Padding of `effective_kernel_size - 1` is applied. Half is applied to the
-  #   front and half to the back. If `effective_kernel_size` is even, the extra
-  #   padding is added to the end.
-  # * An output timestep is valid when its corresponding input timestep is
-  #   valid.
-  SAME = 'same'
-
-  # In CAUSAL_VALID padding mode, the input sequence is padded such that the
-  # output length is equal to the input length before applying striding. Padding
-  # is applied such that the output timestep `to` can only depend on input
-  # timesteps at or before `ti` where `ti * output_ratio = to`.
-  #
-  # Key properties:
-  # * As in SAME padding, the input length is equal to the output length, before
-  #   applying striding.
-  # * Padding of `effective_kernel_size - 1` is applied to the front of the
-  #   sequence.
-  # * As in VALID padding, an output timestep is valid iff all of its input
-  #   timesteps are also valid.
-  CAUSAL_VALID = 'causal_valid'
-
-  # In REVERSE_CAUSAL_VALID padding mode, the input sequence is padded such that
-  # the output length is equal to the input length before applying striding.
-  # Padding is applied such that the output timestep `to` can only depend on
-  # input timesteps at or after `ti` where `ti * output_ratio = to`.
-  #
-  # Key properties:
-  # * As in SAME padding, the input length is equal to the output length, before
-  #   applying striding.
-  # * Padding of `effective_kernel_size - 1` is applied to the back of the
-  #   sequence.
-  REVERSE_CAUSAL_VALID = 'reverse_causal_valid'
-
-  # In CAUSAL padding mode, the input sequence is padded such that the output
-  # length is equal to the input length before applying striding. Padding is
-  # applied such that the output timestep `to` can only depend on input
-  # timesteps at or before `ti` where `ti * output_ratio = to`.
-  #
-  # Key properties:
-  # * As in SAME padding, the input length is equal to the output length, before
-  #   applying striding.
-  # * Padding of `effective_kernel_size - 1` is applied to the front of the
-  #   sequence.
-  # * As in SAME padding, an output timestep is valid when its corresponding
-  #   input timestep is valid.
-  CAUSAL = 'causal'
-
-  # In REVERSE_CAUSAL padding mode, the input sequence is padded such that the
-  # output length is equal to the input length before applying striding. Padding
-  # is applied such that the output timestep `to` can only depend on input
-  # timesteps at or after `ti` where `ti * output_ratio = to`.
-  #
-  # Key properties:
-  # * As in SAME padding, the input length is equal to the output length, before
-  #   applying striding.
-  # * Padding of `effective_kernel_size - 1` is applied to the back of the
-  #   sequence.
-  # * As in SAME padding, an output timestep is valid when its corresponding
-  #   input timestep is valid.
-  REVERSE_CAUSAL = 'reverse_causal'
-
-  # In SEMICAUSAL padding mode, the input sequence is padded such that the
-  # output length is equal to the input length before applying striding. Padding
-  # is applied such that the output timestep `to` can only depend on input
-  # timesteps at or before `ti` where `ti * output_ratio = to`.
-  #
-  # Key properties:
-  # * As in SAME padding, the input length is equal to the output length, before
-  #   applying striding.
-  # * Padding of `effective_kernel_size - stride` is applied to the front of the
-  #   sequence, and padding of `stride - 1` timesteps is applied to the back of
-  #   the sequence for a total of `effective_kernel_size - 1` timesteps of
-  #   padding. If `effective_kernel_size` < `stride`, then padding of
-  #   `effective_kernel_size - 1` is applied to the back of the sequence.
-  # * As in SAME padding, an output timestep is valid when its corresponding
-  #   input timestep is valid.
-  SEMICAUSAL = 'semicausal'
-
-  # In SEMICAUSAL_FULL padding mode, the input sequence is padded such that the
-  # output of the corresponding overlap-add or transpose convolution is of the
-  # same size as the input sequence and perfect reconstruction can be achieved.
-  # The reconstructed signal is of the same length or of length rounded up to
-  # cover the full input sequence.
-  SEMICAUSAL_FULL = 'semicausal_full'
-
-
-PaddingModeString = Literal[
-    'valid',
-    'same',
-    'causal_valid',
-    'reverse_causal_valid',
-    'causal',
-    'reverse_causal',
-    'semicausal',
-    'semicausal_full',
-]
-
-
-class Sequence(Generic[ValuesT, MaskT], metaclass=abc.ABCMeta):
-  """Abstract base class for Sequence."""
-
-  values: ValuesT
-  mask: MaskT
-
-  def __init__(self, values: ValuesT, mask: MaskT):
-    raise NotImplementedError
-
-  @property
-  @abc.abstractmethod
-  def shape(self) -> Shape:
-    pass
-
-  @property
-  @abc.abstractmethod
-  def ndim(self) -> int:
-    pass
-
-  @property
-  @abc.abstractmethod
-  def channel_shape(self) -> Shape:
-    pass
-
-  @property
-  @abc.abstractmethod
-  def dtype(self) -> DType:
-    pass
-
-  @classmethod
-  @abc.abstractmethod
-  def from_values(cls, values: ValuesT) -> 'Sequence':
-    pass
-
-  @classmethod
-  @abc.abstractmethod
-  def concatenate_sequences(cls, sequences: Iterable['Sequence']) -> 'Sequence':
-    pass
-
-  @abc.abstractmethod
-  def expanded_mask(self) -> Any:
-    pass
-
-  @abc.abstractmethod
-  def apply_values(
-      self,
-      values_fn: Callable[..., ValuesT],
-      *args,
-      **kwargs,
-  ) -> 'Sequence':
-    pass
-
-  @abc.abstractmethod
-  def apply_values_masked(
-      self: SequenceSelf,
-      values_fn: Callable[..., ValuesT],
-      *args,
-      **kwargs,
-  ) -> SequenceSelf:
-    pass
-
-  @abc.abstractmethod
-  def apply(
-      self,
-      apply_fn: Callable[..., tuple[ValuesT, MaskT]],
-      *args,
-      **kwargs,
-  ) -> 'Sequence':
-    pass
-
-  @abc.abstractmethod
-  def apply_masked(
-      self: SequenceSelf,
-      apply_fn: Callable[..., tuple[ValuesT, MaskT]],
-      *args,
-      **kwargs,
-  ) -> SequenceSelf:
-    pass
-
-  @abc.abstractmethod
-  def astype(self: SequenceSelf, dtype: DType | None) -> SequenceSelf:
-    pass
-
-  @abc.abstractmethod
-  def lengths(self) -> Any:
-    pass
-
-  @abc.abstractmethod
-  def __getitem__(self: SequenceSelf, the_slice: Any) -> SequenceSelf:
-    pass
-
-  @abc.abstractmethod
-  def pad_time(
-      self: SequenceSelf,
-      pad_left: int,
-      pad_right: int,
-      valid: bool,
-      pad_value: Any | None = None,
-  ) -> SequenceSelf:
-    pass
-
-  @abc.abstractmethod
-  def concatenate(self, other: 'Sequence') -> 'Sequence':
-    pass
-
-  @abc.abstractmethod
-  def mask_invalid(self, mask_value: Any | None = None) -> 'Sequence':
-    pass
-
-  @abc.abstractmethod
-  def unmask(self) -> 'Sequence':
-    pass
-
-
-class SequenceLayerConfig(metaclass=abc.ABCMeta):
-  """Configuration for a SequenceLayer."""
-
-  @abc.abstractmethod
-  def make(self) -> Any:
-    """Creates the sequence layer."""
-
-  @abc.abstractmethod
-  def copy(self, **kwargs) -> 'SequenceLayerConfig':
-    """Returns a copy of the config with updated fields."""
-
-
-class Steppable(metaclass=abc.ABCMeta):
-  """A sequence processing layer that can be executed layerwise or stepwise."""
-
-  @property
-  @abc.abstractmethod
-  def block_size(self) -> int:
-    pass
-
-  @property
-  @abc.abstractmethod
-  def output_ratio(self) -> fractions.Fraction:
-    pass
-
-  @property
-  @abc.abstractmethod
-  def supports_step(self) -> bool:
-    pass
-
-  @property
-  @abc.abstractmethod
-  def input_latency(self) -> int:
-    pass
-
-  @property
-  @abc.abstractmethod
-  def output_latency(self) -> int:
-    pass
-
-  @abc.abstractmethod
-  def get_accumulated_input_latency(self, input_latency: int) -> int:
-    pass
-
-  @abc.abstractmethod
-  def get_accumulated_output_latency(self, output_latency: int) -> int:
-    pass
-
-  @property
-  @abc.abstractmethod
-  def receptive_field(self) -> Any:
-    pass
diff --git a/sequence_layers/abstract/types_test_base.py b/sequence_layers/abstract/types_test_base.py
deleted file mode 100644
index 95eec58..0000000
--- a/sequence_layers/abstract/types_test_base.py
+++ /dev/null
@@ -1,384 +0,0 @@
-# Copyright 2026 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Abstract tests for Sequence types."""
-
-# pylint: disable=missing-class-docstring
-# pylint: disable=missing-function-docstring
-
-import abc
-import dataclasses
-import fractions
-from typing import Any, Callable, Self
-
-from absl.testing import parameterized
-import numpy as np
-from sequence_layers.abstract import types
-
-
-class _AbcParameterizedTestCaseMeta(
-    parameterized.TestGeneratorMetaclass, abc.ABCMeta
-):
-  pass
-
-
-class SequenceLayerTest(
-    parameterized.TestCase, metaclass=_AbcParameterizedTestCaseMeta
-):
-  """Base abstract test class providing common sequence testing assertions."""
-
-  # pylint: disable=invalid-name
-
-  @abc.abstractmethod
-  def assertSequencesClose(self, x: Any, y: Any, **kwargs):
-    pass
-
-  @abc.abstractmethod
-  def assertSequencesNotClose(self, x: Any, y: Any, **kwargs):
-    pass
-
-  @abc.abstractmethod
-  def assertSequencesEqual(self, x: Any, y: Any):
-    pass
-
-  @abc.abstractmethod
-  def assertSequencesNotEqual(self, x: Any, y: Any):
-    pass
-
-  @abc.abstractmethod
-  def assertAllEqual(self, x: Any, y: Any):
-    pass
-
-  @abc.abstractmethod
-  def assertAllClose(self, x: Any, y: Any, **kwargs):
-    pass
-
-  @abc.abstractmethod
-  def assertNotAllEqual(self, x: Any, y: Any):
-    pass
-
-  @abc.abstractmethod
-  def assertNotAllClose(self, x: Any, y: Any, **kwargs):
-    pass
-
-  # pylint: enable=invalid-name
-
-
-class SequenceTest(SequenceLayerTest, metaclass=_AbcParameterizedTestCaseMeta):
-  """Abstract tests for the Sequence class."""
-
-  @abc.abstractmethod
-  def get_backend(self) -> Any:
-    """Returns the backend module (jax.numpy or mlx.core)."""
-
-  # pylint: disable=invalid-name
-
-  @property
-  @abc.abstractmethod
-  def Sequence(self) -> Callable[[types.ValuesT, types.MaskT], types.Sequence]:
-    """Returns the Sequence class for the backend."""
-
-  @property
-  @abc.abstractmethod
-  def MaskedSequence(
-      self,
-  ) -> Callable[[types.ValuesT, types.MaskT], types.Sequence]:
-    """Returns the MaskedSequence class for the backend."""
-
-  # pylint: enable=invalid-name
-
-  @property
-  def check_trees_all_equal(self) -> Callable[[Any, Any], None]:
-    """Returns a function to check tree equality."""
-    return self.assertAllEqual
-
-  def test_mask_invalid_idempotent(self):
-    xp = self.get_backend()
-    values = xp.array([
-        [1.0, 2.0, 3.0, 4.0],
-        [10.0, 20.0, 30.0, 40.0],
-    ])
-    # Different backends might handle boolean creation differently, but standard
-    # numpy-like syntax usually works
-    mask = xp.array([[True, True, False, False], [False, False, False, True]])
-
-    x = self.Sequence(values, mask)
-    masked = x.mask_invalid()
-    self.assertIsNot(masked, x)
-    # We can't easily check isinstance here without importing the concrete
-    # classes, but we can check behavior or use a property if we added one. For
-    # now, we trust the concrete tests to check types if needed, or we could add
-    # abstract methods to check types.
-
-    masked_again = masked.mask_invalid()
-    self.assertIs(masked_again, masked)
-
-    masked2 = x.mask_invalid()
-    self.assertIsNot(masked2, masked)
-
-  @parameterized.named_parameters(
-      ('mask_value=None', 0.0, None),
-      ('mask_value=0.0', 0.0, 0.0),
-      ('mask_value=-1.0', -1.0, -1.0),
-  )
-  def test_mask_invalid(self, mask_value, expected_mask_value):
-    xp = self.get_backend()
-    values = xp.array([
-        [1.0, 2.0, 3.0, 4.0],
-        [10.0, 20.0, 30.0, 40.0],
-    ])
-    mask = xp.array([[True, True, False, False], [False, False, False, True]])
-
-    # Pass mask_value only if it is not None (to test default None behavior vs
-    # explicit value)
-    if expected_mask_value is None:
-      output = self.Sequence(values, mask).mask_invalid()
-      fill_value = 0.0
-    else:
-      output = self.Sequence(values, mask).mask_invalid(mask_value)
-      fill_value = mask_value
-
-    expected_values = xp.array([
-        [1.0, 2.0, fill_value, fill_value],
-        [fill_value, fill_value, fill_value, 40.0],
-    ])
-    self.check_trees_all_equal(output.values, expected_values)
-    self.check_trees_all_equal(output.mask, mask)
-
-  def test_pad_time(self):
-    xp = self.get_backend()
-    values = xp.array([
-        [1.0, 2.0, 3.0, 4.0],
-        [10.0, 20.0, 30.0, 40.0],
-    ])
-    mask = xp.array([[True, True, False, False], [False, False, False, True]])
-
-    x = self.Sequence(values, mask).mask_invalid()
-
-    y = x.pad_time(0, 0, valid=False)
-    self.check_trees_all_equal(y.values, x.values)
-    self.check_trees_all_equal(y.mask, x.mask)
-
-    y = x.pad_time(1, 0, valid=False)
-
-    x_left1 = self.Sequence(
-        xp.array([
-            [0.0, 1.0, 2.0, 3.0, 4.0],
-            [0.0, 10.0, 20.0, 30.0, 40.0],
-        ]),
-        xp.array([
-            [False, True, True, False, False],
-            [False, False, False, False, True],
-        ]),
-    ).mask_invalid()
-    self.check_trees_all_equal(y.values, x_left1.values)
-    self.check_trees_all_equal(y.mask, x_left1.mask)
-
-  def _create_test_sequence(self, shape):
-    xp = self.get_backend()
-    size = 1
-    for d in shape:
-      size *= d
-    values_np = np.arange(size, dtype=np.float32).reshape(shape)
-    mask_np = np.ones(shape[:2], dtype=bool)
-    if shape[0] > 0 and shape[1] > 1:
-      mask_np[0, 1] = False
-
-    values = xp.array(values_np)
-    mask = xp.array(mask_np)
-    return self.Sequence(values, mask)
-
-  def test_slice(self):
-    x = self._create_test_sequence((3, 5, 9))
-
-    self.assertSequencesEqual(
-        x[:, 1:], self.Sequence(x.values[:, 1:], x.mask[:, 1:])
-    )
-    self.assertSequencesEqual(
-        x[:, ::2], self.Sequence(x.values[:, ::2], x.mask[:, ::2])
-    )
-    self.assertSequencesEqual(
-        x[::2, ::3], self.Sequence(x.values[::2, ::3], x.mask[::2, ::3])
-    )
-
-  def test_slice_can_slice_channel_dimensions(self):
-    x = self._create_test_sequence((3, 5, 9, 4))
-
-    self.assertSequencesEqual(
-        x[:, 1:, :], self.Sequence(x.values[:, 1:], x.mask[:, 1:])
-    )
-    self.assertSequencesEqual(
-        x[:, ::2, :3],
-        self.Sequence(x.values[:, ::2, :3], x.mask[:, ::2]),
-    )
-
-  def test_apply_values(self):
-    xp = self.get_backend()
-    values = xp.array([
-        [-1.0, 2.0, 3.0, 4.0],
-        [10.0, -20.0, 30.0, 40.0],
-    ])
-    mask = xp.array([[True, True, False, False], [False, True, False, True]])
-
-    x = self.Sequence(values, mask)
-    masked = x.mask_invalid()
-
-    # Simple abs function
-    fn = abs
-
-    y = x.apply_values(fn)
-    self.check_trees_all_equal(y.values, fn(x.values))
-    self.check_trees_all_equal(y.mask, x.mask)
-
-    y = masked.apply_values(fn)
-    self.check_trees_all_equal(y.values, fn(masked.values))
-    self.check_trees_all_equal(y.mask, x.mask)
-
-    y = masked.apply_values_masked(fn)
-    self.check_trees_all_equal(y.values, fn(masked.values))
-    self.check_trees_all_equal(y.mask, x.mask)
-
-  def test_apply_values_args(self):
-    xp = self.get_backend()
-    values = xp.array([
-        [-1.0, 2.0, 3.0, 4.0],
-        [10.0, -20.0, 30.0, 40.0],
-    ])
-    mask = xp.array([[True, True, False, False], [False, True, False, True]])
-    x = self.Sequence(values, mask)
-
-    target_shape = (2, 4, 1)
-    y = x.apply_values(lambda v, s: v.reshape(s), target_shape)
-    self.check_trees_all_equal(y.values.shape, target_shape)
-    self.check_trees_all_equal(y.mask.shape, (2, 4))
-
-  def test_from_values(self):
-    xp = self.get_backend()
-    values_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-    values = xp.array(values_np)
-    # Get the class from an instance
-    seq = self.Sequence(values, xp.array(np.ones(values.shape[:2], dtype=bool)))
-    seq_cls = type(seq)
-
-    x = seq_cls.from_values(values)
-    self.check_trees_all_equal(x.values, values)
-    self.check_trees_all_equal(
-        x.mask, xp.array(np.ones(values.shape[:2], dtype=bool))
-    )
-
-  def test_astype(self):
-    xp = self.get_backend()
-    values_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-    mask_np = np.array([[True, False], [False, True]], dtype=bool)
-
-    values = xp.array(values_np)
-    mask = xp.array(mask_np)
-
-    x = self.Sequence(values, mask)
-
-    # We need a dtype that matches the backend
-    if xp.__name__ == 'jax.numpy':
-      dtype = xp.int32
-    elif xp.__name__ == 'mlx.core':
-      dtype = xp.int32
-    else:
-      dtype = np.int32
-
-    y = x.astype(dtype)
-
-    # Check values match casted version
-    self.check_trees_all_equal(y.mask, mask)
-    # y.values might be mlx array, values.astype(dtype) might be numpy if values
-    # was numpy? values is backend array. values.astype(dtype) should work if
-    # dtype is backend dtype.
-    self.check_trees_all_equal(y.values, values.astype(dtype))
-
-
-class SteppableTest(
-    parameterized.TestCase, metaclass=_AbcParameterizedTestCaseMeta
-):
-  """Abstract tests for Steppable layers."""
-
-  @abc.abstractmethod
-  def create_steppable(self) -> Any:
-    """Creates a basic Steppable instance that should have default properties."""
-
-  def test_steppable_defaults(self):
-    layer = self.create_steppable()
-    self.assertEqual(layer.block_size, 1)
-    self.assertEqual(layer.output_ratio, fractions.Fraction(1))
-    self.assertTrue(layer.supports_step)
-    self.assertEqual(layer.input_latency, 0)
-    self.assertEqual(layer.output_latency, 0)
-    self.assertEqual(layer.get_accumulated_input_latency(0), 0)
-    self.assertEqual(layer.get_accumulated_output_latency(0), 0)
-
-
-class SequenceLayerConfigTest(
-    SequenceLayerTest, metaclass=_AbcParameterizedTestCaseMeta
-):
-
-  @abc.abstractmethod
-  def get_config_base_cls(self) -> type[types.SequenceLayerConfig]:
-    """Returns the backend-specific SequenceLayerConfig class."""
-
-  def test_copy(self):
-    config_base_cls = self.get_config_base_cls()
-
-    @dataclasses.dataclass(frozen=True)
-    class Config(config_base_cls):
-      a: int = 1234
-      b: str = 'default string'
-
-      def make(self) -> Any:
-        return 'dummy_layer'
-
-      def copy(self, **kwargs) -> Self:
-        return dataclasses.replace(self, **kwargs)
-
-    config = Config()
-    new_config = config.copy(b='new string')
-    self.assertEqual(new_config.a, config.a)
-    self.assertEqual(new_config.b, 'new string')
-
-  def test_copy_raises_on_non_dataclass(self):
-    config_base_cls = self.get_config_base_cls()
-
-    class NonDataclassConfig(config_base_cls):
-
-      def make(self) -> Any:
-        return 'dummy_layer'
-
-    config = NonDataclassConfig()  # pytype: disable=not-instantiable
-    with self.assertRaises(TypeError):
-      new_config = config.copy()
-      del new_config
-
-  def test_copy_disallows_new_fields(self):
-    config_base_cls = self.get_config_base_cls()
-
-    @dataclasses.dataclass(frozen=True)
-    class Config(config_base_cls):
-
-      def make(self) -> Any:
-        return 'dummy_layer'
-
-      def copy(self, **kwargs) -> Self:
-        return dataclasses.replace(self, **kwargs)  # pytype: disable=wrong-keyword-args
-
-    config = Config()
-    # dataclasses.replace raises TypeError for unknown arguments
-    # JAX implementation wraps it in AttributeError
-    with self.assertRaises((TypeError, AttributeError)):
-      new_config = config.copy(field_does_not_exist=1234)
-      del new_config
diff --git a/sequence_layers/jax/__init__.py b/sequence_layers/jax/__init__.py
index 85bb162..3eb88cc 100644
--- a/sequence_layers/jax/__init__.py
+++ b/sequence_layers/jax/__init__.py
@@ -27,3 +27,9 @@
 from sequence_layers.jax.simple import *
 from sequence_layers.jax.time_varying import *
 from sequence_layers.jax.types import *
+
+# (re-export the names for typechecking)
+# pylint: disable=useless-import-alias
+from . import test_utils as test_utils
+from . import types as types
+from .test_utils import SequenceLayerTest
diff --git a/sequence_layers/jax/attention/common.py b/sequence_layers/jax/attention/common.py
index 03eb64c..6c78900 100644
--- a/sequence_layers/jax/attention/common.py
+++ b/sequence_layers/jax/attention/common.py
@@ -32,6 +32,21 @@
 from sequence_layers.jax import typing as jt
 from sequence_layers.jax import utils
 
+# These are the ones which also get exposed in __init__.py. Import other members
+# via sequence_layers.jax.attention.common.
+__all__ = [
+    # go/keep-sorted start
+    'CombinedQueryKeyValueProjection',
+    'CrossAttentionEmits',
+    'InputProjectionModule',
+    'QueryAndKeyValueProjection',
+    'QueryAndSharedKeyValueProjection',
+    'RelativePositionEmbedding',
+    'SelfAttentionEmits',
+    'SeparateQueryKeyValueProjection',
+    # go/keep-sorted end
+]
+
 
 # A negative enough value such that it underflows to a hard zero in softmax.
 _INVALID_LOGIT_VALUE = -1e9
diff --git a/sequence_layers/jax/backend.py b/sequence_layers/jax/backend.py
new file mode 100644
index 0000000..ab694ac
--- /dev/null
+++ b/sequence_layers/jax/backend.py
@@ -0,0 +1,42 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Backend-specific helpers (JAX)."""
+
+from typing import override
+
+import jax.numpy as jnp
+from sequence_layers.specs import backend as spec
+from sequence_layers.specs import types as types_spec
+
+
+class BackendWrapper(spec.xp):
+  """Thin wrapper around JAX to match NumPy interface for tests."""
+
+  bool_ = jnp.bool_
+  int32 = jnp.int32
+  float32 = jnp.float32
+
+  @override
+  def array(self, a, dtype=None) -> types_spec.Array:
+    return jnp.array(a, dtype=dtype)
+
+  @override
+  def zeros(self, shape, dtype=None) -> types_spec.Array:
+    return jnp.zeros(shape, dtype=dtype)
+
+  def concatenate(self, arrays, axis=0) -> types_spec.Array:
+    return jnp.concatenate(arrays, axis=axis)
+
+
+xp: spec.xp = BackendWrapper()
diff --git a/sequence_layers/abstract/__init__.py b/sequence_layers/jax/backend_test.py
similarity index 65%
rename from sequence_layers/abstract/__init__.py
rename to sequence_layers/jax/backend_test.py
index 585a9c6..aa62f37 100644
--- a/sequence_layers/abstract/__init__.py
+++ b/sequence_layers/jax/backend_test.py
@@ -11,7 +11,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Abstract specifications (interfaces and behaviors) for SequenceLayers."""
+"""Tests for JAX backend utilities."""
 
-from sequence_layers.abstract import types
-from sequence_layers.abstract import types_test_base
+from absl.testing import absltest
+from sequence_layers.jax import test_utils
+from sequence_layers.specs import backend_behaviors as spec
+
+
+class ModuleSpecTest(test_utils.SequenceLayerTest, spec.ModuleSpecTest):
+  pass
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/jax/dsp.py b/sequence_layers/jax/dsp.py
index 55d9092..4b3a6d1 100644
--- a/sequence_layers/jax/dsp.py
+++ b/sequence_layers/jax/dsp.py
@@ -1423,7 +1423,7 @@ def get_output_shape(
       input_shape: types.ShapeLike,
       *,
       constants: types.Constants | None = None,
-  ) -> types.ShapeLike:
+  ) -> types.Shape:
     if not input_shape:
       raise ValueError(
           f'{self} requires input with at least rank 1, got: {input_shape}'
diff --git a/sequence_layers/jax/test_utils.py b/sequence_layers/jax/test_utils.py
index 54cfa66..a7f8ede 100644
--- a/sequence_layers/jax/test_utils.py
+++ b/sequence_layers/jax/test_utils.py
@@ -30,6 +30,7 @@
 from sequence_layers.jax import types
 from sequence_layers.jax import typing as jt
 from sequence_layers.jax import utils
+from sequence_layers.specs import test_utils as spec
 
 
 _SequenceLayerT = TypeVar('_SequenceLayerT', bound=types.SequenceLayer)
@@ -777,9 +778,19 @@ def _mask_and_pad_to_max_length(
   return a, b
 
 
-class SequenceLayerTest(parameterized.TestCase):
+class SequenceLayerTest(spec.SequenceLayerTest):
   """Base class for SequenceLayer tests."""
 
+  @property
+  def sl(self) -> Any:
+    import sequence_layers.jax as jax_sl
+
+    return jax_sl
+
+  @property
+  def xp(self) -> Any:
+    return jnp
+
   def setUp(self):
     super().setUp()
     # To avoid flakes, fix random seeds.
diff --git a/sequence_layers/jax/types.py b/sequence_layers/jax/types.py
index 99e6340..08f8f06 100644
--- a/sequence_layers/jax/types.py
+++ b/sequence_layers/jax/types.py
@@ -19,7 +19,20 @@
 import functools
 import math
 import typing
-from typing import Any, Callable, Generic, Iterable, MutableMapping, ParamSpec, Protocol, Self, Sequence as TypingSequence, TypeVar, override
+from typing import (
+    Any,
+    Callable,
+    Generic,
+    Iterable,
+    MutableMapping,
+    ParamSpec,
+    Protocol,
+    Self,
+    Sequence as TypingSequence,
+    TypeVar,
+    cast,
+    override,
+)
 
 from absl import logging
 from flax import linen as nn
@@ -28,12 +41,11 @@
 from jax import numpy as jnp
 import jaxtyping
 import numpy as np
-from sequence_layers.abstract import types as spec
 from sequence_layers.jax import sharding as sharding_lib
 from sequence_layers.jax import typing as jt
+from sequence_layers.specs import types as spec
 import typeguard
 
-
 __all__ = (
     # go/keep-sorted start
     'ArrayLike',
@@ -85,25 +97,27 @@
 # Sequence type aliases:
 MASK_DTYPE = np.bool_
 
-# A rank 2+ tensor of any type.
+# A rank 2+ array of any type.
 ValuesT = TypeVar('ValuesT', bound=jt.Shaped[jt.ArrayT, 'B T *C'])
+NewValuesT = TypeVar('NewValuesT', bound=jt.Shaped[jt.ArrayT, 'B T *C'])
 
-# A boolean batched mask tensor. True indicates a given timepoint is valid, and
+# A boolean batched mask array. True indicates a given timepoint is valid, and
 # False indicates it is invalid.
 MaskT = TypeVar('MaskT', bound=jt.Bool[jt.ArrayT, 'B T'])
+NewMaskT = TypeVar('NewMaskT', bound=jt.Bool[jt.ArrayT, 'B T'])
 
-# An integer batched lengths tensor.
+# An integer batched lengths array.
 LengthsT = TypeVar('LengthsT', bound=jt.Int[jt.ArrayT, 'B'])
 
-# A rank 2 boolean tensor with unit dimensions inserted to match their
+# A rank 2 boolean array with unit dimensions inserted to match their
 # corresponding values (e.g. for broadcasting).
 ExpandedMaskT = TypeVar('ExpandedMaskT', bound=jt.Bool[jt.ArrayT, 'B T *C'])
 
 # A "self" type alias to allow Sequence and subclasses to return their own
-# Sequence subtype.
-# TODO(rryan): Remove when PEP-0673 lands.
+# Sequence subtype. (Self cannot be parameterized.)
 SequenceSelf = TypeVar('SequenceSelf', bound='Sequence')
 
+
 # Args and keyword args for Sequence.apply_values.
 ApplyValuesParams = ParamSpec('ApplyValuesParams')
 ApplyValuesMaskedParams = ParamSpec('ApplyValuesMaskedParams')
@@ -344,6 +358,7 @@ def dtype(self) -> DType:
     return self.values.dtype
 
   @classmethod
+  @override
   def from_lengths(
       cls, values: ValuesT, lengths: LengthsT, is_masked: bool = False
   ) -> 'Sequence':
@@ -398,41 +413,41 @@ def concatenate(self, other: 'Sequence') -> 'Sequence':
   @override
   def apply_values(
       self,
-      values_fn: Callable[..., ValuesT],
+      values_fn: Callable[..., NewValuesT],
       *args: ApplyValuesParams.args,
       **kwargs: ApplyValuesParams.kwargs,
-  ) -> 'Sequence':
+  ) -> 'Sequence[NewValuesT, MaskT]':
     """Transforms values with values_fn, assuming result is unmasked."""
     return Sequence(values_fn(self.values, *args, **kwargs), self.mask)
 
   @override
   def apply_values_masked(
-      self: SequenceSelf,
-      values_fn: Callable[..., ValuesT],
+      self,
+      values_fn: Callable[..., NewValuesT],
       *args: ApplyValuesMaskedParams.args,
       **kwargs: ApplyValuesMaskedParams.kwargs,
-  ) -> SequenceSelf:
+  ) -> 'Sequence[NewValuesT, MaskT]':
     """Transforms values with values_fn, preserving masked state."""
     return type(self)(values_fn(self.values, *args, **kwargs), self.mask)
 
   @override
   def apply(
       self,
-      apply_fn: Callable[..., tuple[ValuesT, MaskT]],
+      apply_fn: Callable[..., tuple[NewValuesT, NewMaskT]],
       *args: ApplyParams.args,
       **kwargs: ApplyParams.kwargs,
-  ) -> 'Sequence':
+  ) -> 'Sequence[NewValuesT, NewMaskT]':
     """Transforms values/mask with apply_fn, assuming result is unmasked."""
     values, mask = apply_fn(self.values, self.mask, *args, **kwargs)
     return Sequence(values, mask)
 
   @override
   def apply_masked(
-      self: SequenceSelf,
-      apply_fn: Callable[..., tuple[ValuesT, MaskT]],
+      self,
+      apply_fn: Callable[..., tuple[NewValuesT, NewMaskT]],
       *args: ApplyMaskedParams.args,
       **kwargs: ApplyMaskedParams.kwargs,
-  ) -> SequenceSelf:
+  ) -> 'Sequence[NewValuesT, NewMaskT]':
     """Transforms values/mask with apply_fn, preserving masked state."""
     # TODO(rryan): Dig into bug preventing the use of
     # Callable[Concatenate[ValuesT, MaskT, ApplyMaskedParams], tuple[ValuesT,
@@ -442,9 +457,9 @@ def apply_masked(
 
   @override
   def astype(
-      self: SequenceSelf,
+      self,
       dtype: DType | None,
-  ) -> SequenceSelf:
+  ) -> Self:
     """Returns a copy of this sequence with its values cast to dtype."""
     return type(self)(self.values.astype(dtype), self.mask)
 
@@ -455,9 +470,9 @@ def lengths(self) -> jt.Int[jt.ArrayT, 'B']:
 
   @override
   def __getitem__(
-      self: SequenceSelf,
+      self,
       the_slice: slice | tuple[int | slice | None | type(Ellipsis), ...],
-  ) -> SequenceSelf:
+  ) -> Self:
     """Slices the Sequence values and mask with the provided slice."""
     if isinstance(the_slice, slice):
       the_slice = (the_slice,)
@@ -473,12 +488,12 @@ def __getitem__(
 
   @override
   def pad_time(
-      self: SequenceSelf,
+      self,
       pad_left: jt.ScalarInt,
       pad_right: jt.ScalarInt,
       valid: bool,
       pad_value: jt.Scalar | None = None,
-  ) -> SequenceSelf:
+  ) -> Self:
     """Pads this sequence with timesteps on the left and right.
 
     Args:
@@ -511,7 +526,7 @@ def pad_time(
       return_type = Sequence
     return return_type(values, mask)
 
-  def reverse_time(self: SequenceSelf) -> SequenceSelf:
+  def reverse_time(self) -> Self:
     """Reverses the sequence along the time dimension.
 
     Note that this only reverses the physical array with no assumptions about
@@ -525,7 +540,7 @@ def reverse_time(self: SequenceSelf) -> SequenceSelf:
         jnp.flip(self.values, axis=1), jnp.flip(self.mask, axis=1)
     )
 
-  def pad_to_multiple(self, block_size: jt.ScalarInt) -> SequenceSelf:
+  def pad_to_multiple(self, block_size: jt.ScalarInt) -> Self:
     pad_length = (
         self.shape[1] + block_size - 1
     ) // block_size * block_size - self.shape[1]
@@ -543,9 +558,36 @@ def unmask(self) -> 'Sequence':
     return self
 
 
-class MaskedSequence(Sequence[ValuesT, MaskT], Generic[ValuesT, MaskT]):
+class MaskedSequence(
+    Generic[ValuesT, MaskT],
+    Sequence[ValuesT, MaskT],
+    spec.MaskedSequence[ValuesT, MaskT],
+):
   """Sequence whose invalid timesteps are masked to zero."""
 
+  @override
+  def apply_values_masked(
+      self,
+      values_fn: Callable[..., NewValuesT],
+      *args: ApplyValuesMaskedParams.args,
+      **kwargs: ApplyValuesMaskedParams.kwargs,
+  ) -> 'MaskedSequence[NewValuesT, MaskT]':
+    return cast(
+        MaskedSequence,
+        super().apply_values_masked(values_fn, *args, **kwargs),  # pytype: disable=wrong-arg-types
+    )
+
+  @override
+  def apply_masked(
+      self,
+      apply_fn: Callable[..., tuple[NewValuesT, NewMaskT]],
+      *args: ApplyMaskedParams.args,
+      **kwargs: ApplyMaskedParams.kwargs,
+  ) -> 'MaskedSequence[NewValuesT, NewMaskT]':
+    return cast(
+        MaskedSequence, super().apply_masked(apply_fn, *args, **kwargs)  # pytype: disable=wrong-arg-types
+    )
+
   @override
   def mask_invalid(self, mask_value: complex | None = None) -> 'Sequence':
     """Returns a sequence with invalid timesteps replaced with mask_value."""
@@ -561,21 +603,19 @@ def unmask(self) -> Sequence:
 
 
 def mask_invalid(
-    sequence: Sequence,
+    self: Sequence,
     mask_value: complex | None = None,
 ) -> 'Sequence':
   """Returns a sequence whose invalid timesteps are replaced with mask_value."""
-  expanded_mask = sequence.expanded_mask()
+  expanded_mask = self.expanded_mask()
   if mask_value is None:
-    masked_values = jnp.zeros_like(sequence.values)
+    masked_values = jnp.zeros_like(self.values)
     result_type = MaskedSequence
   else:
-    masked_values = jnp.full(
-        sequence.values.shape, mask_value, sequence.values.dtype
-    )
+    masked_values = jnp.full(self.values.shape, mask_value, self.values.dtype)
     result_type = Sequence
-  masked_values = jnp.where(expanded_mask, sequence.values, masked_values)
-  return result_type(masked_values, sequence.mask)
+  masked_values = jnp.where(expanded_mask, self.values, masked_values)
+  return result_type(masked_values, self.mask)
 
 
 # Defined outside of Sequence so that mask_invalid can return a MaskedSequence.
@@ -604,6 +644,8 @@ def __getitem__(cls, item):
 
 
 class SequenceT(Sequence, metaclass=MetaSequenceT):
+  """Allows typing to be: SequenceT[Float, "B T C"]."""
+
   pass
 
 
@@ -675,7 +717,7 @@ def _add_custom_checker_lookup_fn(lookup_fn):
 _add_custom_checker_lookup_fn(_sequence_checker_lookup_fn)
 
 
-class Steppable(spec.Steppable):
+class Steppable(spec.Steppable[Sequence, Sequence, ChannelSpec]):
   """A sequence processing layer that can be executed layerwise or stepwise.
 
   # Step-wise execution:
@@ -882,6 +924,7 @@ def receptive_field_per_step(self) -> dict[int, ReceptiveField]:
     )
 
   @abc.abstractmethod
+  @override
   def layer(
       self, x: Sequence, *, training: bool, constants: Constants | None = None
   ) -> Sequence:
@@ -901,6 +944,7 @@ def layer(
         truncated to only represent valid frames.
     """
 
+  @override
   def layer_with_emits(
       self,
       x: Sequence,
@@ -908,11 +952,11 @@ def layer_with_emits(
       training: bool,
       constants: Constants | None = None,
   ) -> tuple[Sequence, Emits]:
-    """Process this layer layer-wise, producing emitted tensors.
+    """Process this layer layer-wise, producing emitted arrays.
 
     This is like `layer`, except it has an additional return value which is the
-    "emitted" tensors for the layer. The emitted tensors are a structure of
-    tensors whose whose values are `ArrayLike`s or `Sequence`s.
+    "emitted" arrays for the layer. The emitted arrays are a structure of
+    arrays whose whose values are `ArrayLike`s or `Sequence`s.
 
     Args:
       x: Input sequence with values shaped [b, t_i, ...].
@@ -926,7 +970,7 @@ def layer_with_emits(
       y: The outputs corresponding to this layer with values shaped
         [b, t_o, ...] where `t_o == t_i * output_ratio`. t_o may have been
         truncated to only represent valid frames.
-      emits: A nest of emitted tensors or Sequences.
+      emits: A nest of emitted arrays or Sequences.
     """
     outputs = self.layer(x, training=training, constants=constants)
     return outputs, ()
@@ -938,6 +982,7 @@ def __call__(
     return self.layer(x, training=training, constants=constants)
 
   @abc.abstractmethod
+  @override
   def step(
       self,
       x: Sequence,
@@ -946,12 +991,12 @@ def step(
       training: bool,
       constants: Constants | None = None,
   ) -> tuple[Sequence, State]:
-    """Process this layer step-wise, producing emitted tensors.
+    """Process this layer step-wise, producing emitted arrays.
 
     Args:
       x: Input sequence with values shaped [b, t_i, ...], where t_i is a
         multiple of block_size.
-      state: A structure of state tensors matching get_initial_state. The
+      state: A structure of state arrays matching get_initial_state. The
         previous state for this layer.
       training: Python bool. Whether we are in training mode.
       constants: A dictionary of constant name to ArrayLike or sl.Sequence.
@@ -962,10 +1007,11 @@ def step(
     Returns:
       y: The outputs corresponding to this step with values shaped [b, t_o, ...]
         where `t_o == t_i * output_ratio`.
-      state: A structure of state tensors matching get_initial_state. The
+      state: A structure of state arrays matching get_initial_state. The
         new state for this layer.
     """
 
+  @override
   def step_with_emits(
       self,
       x: Sequence,
@@ -974,16 +1020,16 @@ def step_with_emits(
       training: bool,
       constants: Constants | None = None,
   ) -> tuple[Sequence, State, Emits]:
-    """Process this layer step-wise, producing emitted tensors.
+    """Process this layer step-wise, producing emitted arrays.
 
     This is like `step`, except it has an additional return value which is the
-    "emitted" tensors for the step. The emitted tensors are a structure of
-    tensors whose values are `ArrayLike`s or `Sequence`s.
+    "emitted" arrays for the step. The emitted arrays are a structure of
+    arrays whose values are `ArrayLike`s or `Sequence`s.
 
     Args:
       x: Input sequence with values shaped [b, t_i, ...], where t_i is a
         multiple of block_size.
-      state: A structure of state tensors matching get_initial_state. The
+      state: A structure of state arrays matching get_initial_state. The
         previous state for this layer.
       training: Python bool. Whether we are in training mode.
       constants: A dictionary of constant name to ArrayLike or sl.Sequence.
@@ -994,14 +1040,15 @@ def step_with_emits(
     Returns:
       y: The outputs corresponding to this step with values shaped [b, t_o, ...]
         where `t_o == t_i * output_ratio`.
-      state: A structure of state tensors matching get_initial_state. The
+      state: A structure of state arrays matching get_initial_state. The
         new state for this layer.
-      emits: A nest of emitted tensors or Sequences.
+      emits: A nest of emitted arrays or Sequences.
     """
     outputs, state = self.step(x, state, training=training, constants=constants)
     return outputs, state, ()
 
   @abc.abstractmethod
+  @override
   def get_initial_state(
       self,
       batch_size: int,
@@ -1023,14 +1070,15 @@ def get_initial_state(
         attention layer this may contain the source sequence to attend to.
 
     Returns:
-      An integer, TensorShape or structure of integer/TensorShapes.
+      An integer, shape or structure of integer/shapes.
     """
 
   @abc.abstractmethod
+  @override
   def get_output_shape(
       self, input_shape: ShapeLike, *, constants: Constants | None = None
   ) -> Shape:
-    """Returns the output shape this layer produces for an input shape.
+    """Returns the output channel shape this layer produces for an input channel shape.
 
     Args:
       input_shape: A shape representing the channels dimension of the input
@@ -1091,12 +1139,14 @@ def get_output_spec_for_sequence(
     return self.get_output_spec(x.channel_spec, constants=constants)
 
   @abc.abstractmethod
+  @override
   def get_output_dtype(
       self, input_dtype: DType, *, constants: Constants | None = None
   ) -> DType:
     """Returns the layer's output dtype for the specified input dtype."""
 
   @nn.nowrap
+  @override
   def get_output_spec(
       self,
       input_spec: ChannelSpec,
@@ -1263,14 +1313,17 @@ def check_step_with_emits_fn(
   return check_step_with_emits_fn
 
 
-class SequenceLayer(nn.Module, Steppable):
+class SequenceLayer(
+    nn.Module, Steppable, spec.SequenceLayer[Sequence, Sequence, ChannelSpec]
+):
   """Base Module for Sequence Layers."""
 
 
-class PreservesType:
+class PreservesType(spec.PreservesType):
   """A mix-in for layers that do not change the input dtype."""
 
   @nn.nowrap
+  @override
   def get_output_dtype(
       self, input_dtype: DType, *, constants: Constants | None = None
   ) -> DType:
@@ -1278,7 +1331,7 @@ def get_output_dtype(
     return input_dtype
 
 
-class PreservesShape:
+class PreservesShape(spec.PreservesShape):
   """A mix-in for layers that do not change the input shape."""
 
   @nn.nowrap
@@ -1289,8 +1342,8 @@ def get_output_shape(
     return tuple(input_shape)
 
 
-class Emitting(SequenceLayer, metaclass=abc.ABCMeta):
-  """A SequenceLayer that emits auxiliary tensors.
+class Emitting(SequenceLayer, spec.Emitting[Sequence, Sequence, ChannelSpec]):  # pytype: disable=ignored-abstractmethod
+  """A SequenceLayer that emits auxiliary arrays.
 
   This is a convenience subclass that implements step and layer in terms of
   step_with_emits and layer_with_emits, so that implementors need only implement
@@ -1299,6 +1352,7 @@ class Emitting(SequenceLayer, metaclass=abc.ABCMeta):
   do not produce emits.
   """
 
+  @override
   def step(
       self,
       x: Sequence,
@@ -1323,6 +1377,7 @@ def step_with_emits(
   ) -> tuple[Sequence, State, Emits]:
     pass
 
+  @override
   def layer(
       self,
       x: Sequence,
@@ -1346,7 +1401,7 @@ def layer_with_emits(
     pass
 
 
-class Stateless(SequenceLayer):
+class Stateless(SequenceLayer, spec.Stateless[Sequence, Sequence, ChannelSpec]):  # pytype: disable=ignored-abstractmethod
   """A SequenceLayer with no state over time required for step-wise processing.
 
   Sub-classes must only implement:
@@ -1356,9 +1411,11 @@ class Stateless(SequenceLayer):
   """
 
   @property
+  @override
   def receptive_field_per_step(self) -> dict[int, ReceptiveField]:
     return {0: (0, 0)}
 
+  @override
   def get_initial_state(
       self,
       batch_size: int,
@@ -1369,9 +1426,11 @@ def get_initial_state(
   ) -> State:
     del batch_size
     del input_spec
+    del training
     del constants
     return ()
 
+  @override
   def step(
       self,
       x: Sequence,
@@ -1382,9 +1441,30 @@ def step(
   ) -> tuple[Sequence, State]:
     return self.layer(x, training=training, constants=constants), state
 
+  def get_output_shape(
+      self, input_shape: ShapeLike, *, constants: Constants | None = None
+  ) -> Shape:
+    raise NotImplementedError()
+
+  def get_output_dtype(
+      self, input_dtype: DType, *, constants: Constants | None = None
+  ) -> DType:
+    raise NotImplementedError()
+
+  def layer(
+      self,
+      x: Sequence,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> 'Sequence':
+    raise NotImplementedError()
 
-class StatelessEmitting(Emitting):
-  """A SequenceLayer with no state over time that emits auxiliary tensors.
+
+class StatelessEmitting(  # pytype: disable=ignored-abstractmethod
+    Emitting, spec.StatelessEmitting[Sequence, Sequence, ChannelSpec]
+):
+  """A SequenceLayer with no state over time that emits auxiliary arrays.
 
   Sub-classes must only implement:
   - layer_with_emits
@@ -1393,9 +1473,11 @@ class StatelessEmitting(Emitting):
   """
 
   @property
+  @override
   def receptive_field_per_step(self) -> dict[int, ReceptiveField]:
     return {0: (0, 0)}
 
+  @override
   def step_with_emits(
       self,
       x: Sequence,
@@ -1409,6 +1491,7 @@ def step_with_emits(
     )
     return outputs, state, emits
 
+  @override
   def get_initial_state(
       self,
       batch_size: int,
@@ -1417,21 +1500,59 @@ def get_initial_state(
       training: bool,
       constants: Constants | None = None,
   ) -> State:
+    del batch_size
+    del input_spec
+    del training
+    del constants
     return ()
 
+  @abc.abstractmethod
+  @override
+  def get_output_shape(
+      self, input_shape: ShapeLike, *, constants: Constants | None = None
+  ) -> Shape:
+    raise NotImplementedError()
+
+  @abc.abstractmethod
+  @override
+  def get_output_dtype(
+      self, input_dtype: DType, *, constants: Constants | None = None
+  ) -> DType:
+    raise NotImplementedError()
 
-class StatelessPointwise(PreservesShape, Stateless):
+  @abc.abstractmethod
+  @override
+  def layer_with_emits(
+      self,
+      x: Sequence[ValuesT, MaskT],
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[Sequence[ValuesT, MaskT], Emits]:
+    raise NotImplementedError()
+
+
+class StatelessPointwise(
+    PreservesShape,
+    Stateless,
+    spec.StatelessPointwise[Sequence, Sequence, ChannelSpec],
+):
   """A SequenceLayer that has no state and operates pointwise on its input."""
 
 
-class StatelessPointwiseFunctor(StatelessPointwise, metaclass=abc.ABCMeta):
+class StatelessPointwiseFunctor(  # pytype: disable=ignored-abstractmethod
+    StatelessPointwise,
+    spec.StatelessPointwiseFunctor[Sequence, Sequence, ChannelSpec],
+):
   """A stateless SequenceLayer for simple pointwise processing fns."""
 
   @abc.abstractmethod
+  @override
   def fn(self, values: ValuesT, mask: MaskT) -> tuple[ValuesT, MaskT]:
     """Transforms each scalar in values independently."""
 
   @property
+  @override
   def mask_required(self):
     """Returns true if fn can change the sequence's masked state.
 
@@ -1440,6 +1561,7 @@ def mask_required(self):
     return True
 
   @check_layer
+  @override
   def layer(
       self,
       x: Sequence,
diff --git a/sequence_layers/jax/types_test.py b/sequence_layers/jax/types_test.py
index c2bec3e..87d0e04 100644
--- a/sequence_layers/jax/types_test.py
+++ b/sequence_layers/jax/types_test.py
@@ -11,10 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# PyType limitation that interprets jax/typing.py as stdlib typing:
+# pytype: disable=module-attr
 """Types test."""
 
 import dataclasses
-import typing
+from typing import Sequence
 
 import chex
 import flax.linen as nn
@@ -22,37 +25,33 @@
 import jax.numpy as jnp
 import jaxtyping
 import numpy as np
-from sequence_layers.abstract import types_test_base
 from sequence_layers.jax import simple
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
 from sequence_layers.jax import typing as jt
+from sequence_layers.specs import types_behaviors as spec
 
 
-class Foo(nn.Module):
-
-  @nn.compact
-  def __call__(self, x: types.Sequence) -> types.Sequence:
-    return x
+class ModuleInterfaceTest(
+    test_utils.SequenceLayerTest, spec.ModuleInterfaceTest
+):
+  pass
 
 
-class SequenceTest(test_utils.SequenceLayerTest, types_test_base.SequenceTest):
+class SequenceTest(test_utils.SequenceLayerTest, spec.SequenceTest):
   """Tests for the Sequence class."""
 
-  def get_backend(self):
-    return jnp
-
-  @property
-  def Sequence(self):
-    return types.Sequence
-
-  @property
-  def MaskedSequence(self):
-    return types.MaskedSequence
-
   def test_type_checks(self):
     """Test type checks in Sequence.__post_init__."""
 
+    class Foo(nn.Module):
+
+      @nn.compact
+      def __call__(
+          self, x: types.Sequence[types.ValuesT, types.MaskT]
+      ) -> types.Sequence:
+        return x
+
     # Allowed: Both array-like.
     types.Sequence(jnp.zeros((2, 3, 5)), jnp.zeros((2, 3), dtype=jnp.bool_))
     types.Sequence(np.zeros((2, 3, 5)), np.zeros((2, 3), dtype=jnp.bool_))
@@ -106,26 +105,6 @@ def test_type_checks(self):
     with self.assertRaises(jaxtyping.TypeCheckError):
       types.Sequence(np.zeros((2, 3, 5)), np.zeros((1, 3), dtype=jnp.bool_))
 
-  def test_mask_invalid_idempotent(self):
-    values = jnp.array([
-        [1.0, 2.0, 3.0, 4.0],
-        [10.0, 20.0, 30.0, 40.0],
-    ])
-    mask = jnp.array([[True, True, False, False], [False, False, False, True]])
-
-    x = types.Sequence(values, mask)
-    masked = x.mask_invalid()
-    self.assertIsNot(masked, x)
-    self.assertIsInstance(masked, types.MaskedSequence)
-
-    masked_again = masked.mask_invalid()
-    self.assertIs(masked_again, masked)
-    self.assertIsInstance(masked_again, types.MaskedSequence)
-
-    masked2 = x.mask_invalid()
-    self.assertIsNot(masked2, masked)
-    self.assertIsInstance(masked2, types.MaskedSequence)
-
   def test_type_annotation(self):
     if not jt.runtime_type_checking_enabled:
       self.skipTest('Type checking is disabled.')
@@ -219,36 +198,17 @@ def fn(x: types.Sequence) -> types.Sequence:
     y = fn(x)
     self.assertSequencesEqual(y, x)
 
-  def test_from_lengths(self):
-    x_expected = test_utils.random_sequence(5, 17, 2)
-    x = types.Sequence.from_lengths(x_expected.values, x_expected.lengths())
-    self.assertSequencesEqual(x, x_expected)
-
-    # Out of range lengths are clipped to 0 or max.
-    x = types.Sequence.from_lengths(x_expected.values, [-1, 0, 5, 17, 18])
-    self.assertAllEqual(x.lengths(), jnp.asarray([0, 0, 5, 17, 17]))
-    self.assertNotIsInstance(x, types.MaskedSequence)
-
-    # Return type is MaskedSequence if is_masked=True.
-    x = types.Sequence.from_lengths(
-        x_expected.values, [-1, 0, 5, 17, 18], is_masked=True
-    )
-    self.assertAllEqual(x.lengths(), jnp.asarray([0, 0, 5, 17, 17]))
-    self.assertIsInstance(x, types.MaskedSequence)
-
 
 class SequenceLayerConfigTest(
-    test_utils.SequenceLayerTest, types_test_base.SequenceLayerConfigTest
+    test_utils.SequenceLayerTest, spec.SequenceLayerConfigTest
 ):
-
-  def get_config_base_cls(self):
-    return types.SequenceLayerConfig
+  pass
 
   def test_copy_raises_on_mutable_attribute(self):
 
     @dataclasses.dataclass(slots=True)
     class ConfigWithSequence(types.SequenceLayerConfig):
-      seq: typing.Sequence[int]
+      seq: Sequence[int]
 
       def make(self) -> simple.Identity:
         return simple.Identity.Config().make()
@@ -283,30 +243,36 @@ def make(self) -> simple.Identity:
       del new_config
 
 
-class SteppableTest(types_test_base.SteppableTest):
+class SteppableTest(test_utils.SequenceLayerTest, spec.SteppableTest):
+  pass
 
-  def create_steppable(self):
 
-    class DefaultSteppable(types.Steppable):
+class PreservesTypeTest(test_utils.SequenceLayerTest, spec.PreservesTypeTest):
+  pass
 
-      def layer(self, x, *, training=False, constants=None):
-        return x
 
-      def step(self, x, state, *, training=False, constants=None):
-        return x, state
+class PreservesShapeTest(test_utils.SequenceLayerTest, spec.PreservesShapeTest):
+  pass
 
-      def get_initial_state(
-          self, batch_size, input_spec, *, training=False, constants=None
-      ):
-        return 0
 
-      def get_output_shape(self, input_shape, *, constants=None):
-        return input_shape
+class StatelessTest(test_utils.SequenceLayerTest, spec.StatelessTest):
+  pass
 
-      def get_output_dtype(self, input_dtype, *, constants=None):
-        return input_dtype
 
-    return DefaultSteppable()
+class EmittingTest(test_utils.SequenceLayerTest, spec.EmittingTest):
+  pass
+
+
+class StatelessEmittingTest(
+    test_utils.SequenceLayerTest, spec.StatelessEmittingTest
+):
+  pass
+
+
+class StatelessPointwiseFunctorTest(
+    test_utils.SequenceLayerTest, spec.StatelessPointwiseFunctorTest
+):
+  pass
 
 
 if __name__ == '__main__':
diff --git a/sequence_layers/jax/typing.py b/sequence_layers/jax/typing.py
index 9b75781..3788271 100644
--- a/sequence_layers/jax/typing.py
+++ b/sequence_layers/jax/typing.py
@@ -18,22 +18,23 @@
 from jaxtyping import AbstractDtype, Bool, config as jaxtyping_config, Float, Int, PyTree, Shaped, jaxtyped, TypeCheckError
 import numpy as np
 import typeguard
-from typing import Callable, TypeVar, Union
+from typing import Callable, TypeVar, TYPE_CHECKING, Union
 
+if TYPE_CHECKING:
+  ArrayT = jax.Array | np.ndarray
+else:
 
-class _MetaArrayT(type):
-  types = ()
+  class _MetaArrayT(type):
+    types = ()
 
-  def __instancecheck__(cls, obj):
-    return isinstance(obj, cls.types)
+    def __instancecheck__(cls, obj):
+      return isinstance(obj, cls.types)
 
+  class JaxArrayT(metaclass=_MetaArrayT):
+    types = (jax.Array, jax.ShapeDtypeStruct)
 
-class JaxArrayT(metaclass=_MetaArrayT):
-  types = (jax.Array, jax.ShapeDtypeStruct)
-
-
-class ArrayT(metaclass=_MetaArrayT):
-  types = (JaxArrayT, np.ndarray)
+  class ArrayT(metaclass=_MetaArrayT):
+    types = (JaxArrayT, np.ndarray)
 
 
 Scalar = Shaped[ArrayT, ''] | Shaped[np.generic, ''] | Shaped[jnp.generic, '']
diff --git a/sequence_layers/mlx/__init__.py b/sequence_layers/mlx/__init__.py
index 8f5b18f..ddaa329 100644
--- a/sequence_layers/mlx/__init__.py
+++ b/sequence_layers/mlx/__init__.py
@@ -1,3 +1,4 @@
+# pylint: disable=cyclic-import
 # Copyright 2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,3 +15,10 @@
 """Sequence layers in MLX."""
 
 from sequence_layers.mlx.types import *
+
+# pylint: disable=useless-import-alias
+# (re-export the names for typechecking)
+from . import backend as backend
+from . import test_utils as test_utils
+from . import types as types
+from .test_utils import SequenceLayerTest
diff --git a/sequence_layers/mlx/backend.py b/sequence_layers/mlx/backend.py
new file mode 100644
index 0000000..513cd21
--- /dev/null
+++ b/sequence_layers/mlx/backend.py
@@ -0,0 +1,43 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Backend-specific helpers (MLX)."""
+
+from typing import override
+
+import mlx.core as mx
+from sequence_layers.specs import backend as spec
+from sequence_layers.specs import types as types_spec
+
+
+class BackendWrapper(spec.xp):
+  """Thin wrapper around MLX to match NumPy interface for tests."""
+
+  bool_ = mx.bool_
+  int32 = mx.int32
+  float32 = mx.float32
+
+  @override
+  def array(self, a, dtype=None) -> types_spec.Array:
+    return mx.array(a, dtype=dtype)
+
+  @override
+  def zeros(self, shape, dtype=None) -> types_spec.Array:
+    return mx.zeros(shape, dtype=dtype)
+
+  @override
+  def concatenate(self, arrays, axis=0) -> types_spec.Array:
+    return mx.concatenate(arrays, axis=axis)  # pyrefly: ignore[bad-argument-type]
+
+
+xp: spec.xp = BackendWrapper()
diff --git a/sequence_layers/mlx/backend_test.py b/sequence_layers/mlx/backend_test.py
new file mode 100644
index 0000000..af07d82
--- /dev/null
+++ b/sequence_layers/mlx/backend_test.py
@@ -0,0 +1,26 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for MLX backend utilities."""
+
+from absl.testing import absltest
+from sequence_layers.mlx import test_utils  # pylint: disable=cyclic-import
+from sequence_layers.specs import backend_behaviors as spec
+
+
+class ModuleSpecTest(test_utils.SequenceLayerTest, spec.ModuleSpecTest):
+  pass
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/basic_types.py b/sequence_layers/mlx/basic_types.py
deleted file mode 100644
index 7369df2..0000000
--- a/sequence_layers/mlx/basic_types.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright 2026 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Basic sequence types for MLX."""
-
-import dataclasses
-from typing import Generic, TypeVar
-
-import mlx.core as mx
-import numpy as np
-
-# A rank 2+ tensor of any type.
-# Note: MLX does not support jaxtyping-style shape annotations out of the box,
-# so we simply bind to mx.array.
-ValuesT = TypeVar('ValuesT', bound=mx.array)
-
-# You can also add the others if you need them:
-MaskT = TypeVar('MaskT', bound=mx.array)
-LengthsT = TypeVar('LengthsT', bound=mx.array)
-ExpandedMaskT = TypeVar('ExpandedMaskT', bound=mx.array)
-# A "self" type alias to allow Sequence and subclasses to return their own
-# Sequence subtype.
-SequenceSelf = TypeVar('SequenceSelf', bound='Sequence')
-Shape = tuple[int, ...]
-DType = np.dtype
-
-
-def sequence_mask(lengths: LengthsT, maxlen: int) -> MaskT:
-  return mx.arange(maxlen)[None, :] < mx.array(lengths)[:, None]
-
-
-@dataclasses.dataclass(frozen=True)
-class ChannelSpec:
-  """A specification for the channel shape and dtype of a sequence."""
-
-  shape: Shape
-  dtype: DType
-
-
-class Sequence(Generic[ValuesT, MaskT]):
-  """A generic sequence container that preserves masking information."""
-
-  values: ValuesT
-  mask: MaskT
-
-  def __init__(self, values: ValuesT, mask: MaskT):
-    self.values = values
-    self.mask = mask
-
-  @property
-  def shape(self) -> Shape:
-    """Returns the shape of the sequence values."""
-    return self.values.shape
-
-  @property
-  def ndim(self) -> int:
-    """Returns the rank of the sequence values."""
-    return self.values.ndim
-
-  @property
-  def channel_shape(self) -> Shape:
-    """Returns the channel shape (the shape without batch and time)."""
-    return self.values.shape[2:]
-
-  @property
-  def channel_spec(self) -> ChannelSpec:
-    """Returns a "spec" for this sequence (the channel shape and dtype)."""
-    return ChannelSpec(self.channel_shape, self.dtype)
-
-  @property
-  def dtype(self) -> DType:
-    """Returns the dtype of the sequence values."""
-    return self.values.dtype
-
-  def expanded_mask(self) -> ExpandedMaskT:
-    """Returns the Sequence mask with dimensions expanded to match values."""
-    return self.mask.reshape(self.mask.shape + (1,) * (self.values.ndim - 2))
-
-  def mask_invalid(self, mask_value: complex | None = None) -> 'Sequence':
-    """Returns a sequence with invalid timesteps replaced with mask_value."""
-    raise NotImplementedError('Replaced below.')
-
-  def unmask(self) -> 'Sequence':
-    """Returns an unmasked version of this sequence with unchanged values."""
-    # We are already an unmasked sequence.
-    return self
-
-
-class MaskedSequence(Sequence[ValuesT, MaskT]):
-  """Sequence whose invalid timesteps are masked to zero."""
-
-  def mask_invalid(self, mask_value: complex | None = None) -> 'Sequence':
-    """Returns a sequence with invalid timesteps replaced with mask_value."""
-    if mask_value is None:
-      return self
-    else:
-      return mask_invalid(self, mask_value)
-
-  def unmask(self) -> Sequence:
-    """Returns an unmasked version of this sequence with unchanged values."""
-    return Sequence(self.values, self.mask)
-
-
-def mask_invalid(
-    sequence: Sequence,
-    mask_value: complex | None = None,
-) -> 'Sequence':
-  """Returns a sequence whose invalid timesteps are replaced with mask_value."""
-  expanded_mask = sequence.expanded_mask()
-  if mask_value is None:
-    masked_values = mx.zeros_like(sequence.values)
-    result_type = MaskedSequence
-  else:
-    masked_values = mx.full(
-        sequence.values.shape, mask_value, sequence.values.dtype
-    )
-    result_type = Sequence
-  masked_values = mx.where(expanded_mask, sequence.values, masked_values)
-  return result_type(masked_values, sequence.mask)
-
-
-# Defined outside of Sequence so that mask_invalid can return a MaskedSequence.
-Sequence.mask_invalid = mask_invalid
diff --git a/sequence_layers/mlx/types.py b/sequence_layers/mlx/types.py
index e5ffd0d..b959725 100644
--- a/sequence_layers/mlx/types.py
+++ b/sequence_layers/mlx/types.py
@@ -18,12 +18,22 @@
 import fractions
 import functools
 import math
-from typing import Callable, Generic, Iterable, TypeVar, override
+import types
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    MutableMapping,
+    Self,
+    TypeVar,
+    cast,
+    override,
+)
 
+import jaxtyping as jt
 from mlx import nn
 import mlx.core as mx
-import numpy as np
-from sequence_layers.abstract import types
+from sequence_layers.specs import types as spec
 
 # Type aliases.
 MASK_DTYPE = mx.bool_
@@ -32,19 +42,20 @@
 MaskT = TypeVar('MaskT', bound=mx.array)
 LengthsT = TypeVar('LengthsT', bound=mx.array)
 ExpandedMaskT = TypeVar('ExpandedMaskT', bound=mx.array)
+NewValuesT = TypeVar('NewValuesT', bound=mx.array)
+NewMaskT = TypeVar('NewMaskT', bound=mx.array)
 SequenceSelf = TypeVar('SequenceSelf', bound='Sequence')
 
 Shape = tuple[int, ...]
 ShapeLike = list[int] | tuple[int, ...]
-DType = np.dtype
+DType = mx.Dtype
 State = object  # Any pytree.
-Constants = dict[str, object]
-Emits = object
+Constants = MutableMapping[str, jt.PyTree[mx.array]]
+Emits = jt.PyTree[mx.array]
 
 # Receptive field.
 ReceptiveField = tuple[float | int, float | int] | None
 
-
 __all__ = (
     # go/keep-sorted start
     'ChannelSpec',
@@ -87,9 +98,11 @@ def __init__(self, shape: Shape, dtype: DType):
     self.shape = shape
     self.dtype = dtype
 
+  @override
   def __repr__(self) -> str:
     return f'ShapeDType(shape={self.shape}, dtype={self.dtype})'
 
+  @override
   def __eq__(self, other: object) -> bool:
     if not isinstance(other, ShapeDType):
       return NotImplemented
@@ -101,14 +114,17 @@ def __hash__(self) -> int:
 
 ChannelSpec = ShapeDType
 
-PaddingMode = types.PaddingMode
+PaddingMode = spec.PaddingMode
 
 
-def sequence_mask(lengths: LengthsT, maxlen: int) -> MaskT:
-  return mx.arange(maxlen)[None, :] < mx.array(lengths)[:, None]
+def sequence_mask(lengths: LengthsT, maxlen: int) -> mx.array:
+  """Generates a boolean mask for sequences based on lengths."""
+  return mx.arange(maxlen)[None, :] < mx.array(lengths)[:, None]  # pylint: disable=unsubscriptable-object
 
 
-class Sequence(types.Sequence[ValuesT, MaskT], Generic[ValuesT, MaskT]):
+class Sequence[ValuesT: mx.array, MaskT: mx.array](
+    spec.Sequence[ValuesT, MaskT]
+):
   """A generic sequence container that preserves masking information."""
 
   values: ValuesT
@@ -147,6 +163,23 @@ def dtype(self) -> DType:
     """Returns the dtype of the sequence values."""
     return self.values.dtype
 
+  @classmethod
+  @override
+  def from_lengths(
+      cls,
+      values: ValuesT,
+      lengths: LengthsT,
+      is_masked: bool = False,
+  ) -> 'Sequence':
+    """Constructs a sequence from values and per-batch element lengths."""
+    values_arr = mx.array(values)
+    mask = sequence_mask(lengths, maxlen=values_arr.shape[1])
+    return (
+        MaskedSequence(values_arr, mask)
+        if is_masked
+        else Sequence(values_arr, mask)
+    )
+
   @classmethod
   @override
   def from_values(cls, values: ValuesT) -> 'MaskedSequence':
@@ -174,7 +207,7 @@ def concatenate_sequences(cls, sequences: Iterable['Sequence']) -> 'Sequence':
     )
 
   @override
-  def expanded_mask(self) -> ExpandedMaskT:
+  def expanded_mask(self) -> mx.array:
     """Returns the Sequence mask expanded to match values rank."""
     return self.mask.reshape(self.mask.shape + (1,) * (self.values.ndim - 2))
 
@@ -190,13 +223,16 @@ def apply_values(
 
   @override
   def apply_values_masked(
-      self: SequenceSelf,
-      values_fn: Callable[..., ValuesT],
+      self,
+      values_fn: Callable[..., NewValuesT],
       *args,
       **kwargs,
-  ) -> SequenceSelf:
+  ) -> 'Sequence[NewValuesT, MaskT]':
     """Transforms values, preserving masked state."""
-    return type(self)(values_fn(self.values, *args, **kwargs), self.mask)
+    return cast(
+        'Sequence[NewValuesT, MaskT]',
+        type(self)(values_fn(self.values, *args, **kwargs), self.mask),
+    )
 
   @override
   def apply(
@@ -211,14 +247,14 @@ def apply(
 
   @override
   def apply_masked(
-      self: SequenceSelf,
-      apply_fn: Callable[..., tuple[ValuesT, MaskT]],
+      self,
+      apply_fn: Callable[..., tuple[NewValuesT, NewMaskT]],
       *args,
       **kwargs,
-  ) -> SequenceSelf:
+  ) -> 'Sequence[NewValuesT, NewMaskT]':
     """Transforms values/mask, preserving masked state."""
     values, mask = apply_fn(self.values, self.mask, *args, **kwargs)
-    return type(self)(values, mask)
+    return cast('Sequence[NewValuesT, NewMaskT]', type(self)(values, mask))
 
   @override
   def astype(self: SequenceSelf, dtype: DType | None) -> SequenceSelf:
@@ -235,7 +271,7 @@ def lengths(self) -> mx.array:
   @override
   def __getitem__(
       self: SequenceSelf,
-      the_slice,
+      the_slice: slice | tuple[int | slice | None | types.EllipsisType, ...],
   ) -> SequenceSelf:
     """Slices the Sequence values and mask."""
     if isinstance(the_slice, slice):
@@ -275,8 +311,9 @@ def concatenate(self, other: 'Sequence') -> 'Sequence':
     """Concatenates with other on the time dimension."""
     values = mx.concatenate([self.values, other.values], axis=1)
     mask = mx.concatenate([self.mask, other.mask], axis=1)
-    return_type = type(self) if type(self) is type(other) else Sequence
-    return return_type(values, mask)
+    if type(self) is type(other):
+      return type(self)(values, mask)
+    return Sequence(values, mask)
 
   @override
   def mask_invalid(self, mask_value: complex | None = None) -> 'Sequence':
@@ -289,11 +326,37 @@ def unmask(self) -> 'Sequence':
     return self
 
 
-class MaskedSequence(Sequence[ValuesT, MaskT], Generic[ValuesT, MaskT]):
+class MaskedSequence[ValuesT: mx.array, MaskT: mx.array](
+    Sequence[ValuesT, MaskT], spec.MaskedSequence[ValuesT, MaskT]
+):
   """Sequence whose invalid timesteps are masked to zero."""
 
   @override
-  def mask_invalid(self, mask_value: complex | None = None) -> 'Sequence':
+  def apply_values_masked(
+      self,
+      values_fn: Callable[..., NewValuesT],
+      *args,
+      **kwargs,
+  ) -> 'MaskedSequence[NewValuesT, MaskT]':
+    return cast(
+        'MaskedSequence[NewValuesT, MaskT]',
+        type(self)(values_fn(self.values, *args, **kwargs), self.mask),
+    )
+
+  @override
+  def apply_masked(
+      self,
+      apply_fn: Callable[..., tuple[NewValuesT, NewMaskT]],
+      *args,
+      **kwargs,
+  ) -> 'MaskedSequence[NewValuesT, NewMaskT]':
+    values, mask = apply_fn(self.values, self.mask, *args, **kwargs)
+    return cast(
+        'MaskedSequence[NewValuesT, NewMaskT]', type(self)(values, mask)
+    )
+
+  @override
+  def mask_invalid(self, mask_value: complex | None = None) -> Sequence:
     if mask_value is None:
       return self
     return mask_invalid(self, mask_value)
@@ -304,21 +367,23 @@ def unmask(self) -> Sequence:
 
 
 def mask_invalid(
-    sequence: Sequence,
+    self: Sequence[ValuesT, MaskT],
     mask_value: complex | None = None,
-) -> 'Sequence':
+) -> Sequence[ValuesT, MaskT]:
   """Returns a sequence with invalid timesteps replaced."""
-  expanded_mask = sequence.expanded_mask()
+  expanded_mask = self.expanded_mask()
   if mask_value is None:
-    masked_values = mx.zeros_like(sequence.values)
-    result_type = MaskedSequence
+    masked_values = mx.zeros_like(self.values)
+    result_type: type[Sequence[mx.array, mx.array]] = MaskedSequence
   else:
     masked_values = mx.full(
-        sequence.values.shape, mask_value, sequence.values.dtype
+        self.values.shape,
+        mask_value,  # pyrefly: ignore[bad-argument-type]
+        self.values.dtype,
     )
-    result_type = Sequence
-  masked_values = mx.where(expanded_mask, sequence.values, masked_values)
-  return result_type(masked_values, sequence.mask)
+    result_type: type[Sequence[mx.array, mx.array]] = Sequence
+  masked_values = mx.where(expanded_mask, self.values, masked_values)
+  return cast(Sequence[ValuesT, MaskT], result_type(masked_values, self.mask))
 
 
 # Defined outside of Sequence so mask_invalid can return MaskedSequence.
@@ -330,6 +395,7 @@ def mask_invalid(
 
 
 def _check_output_spec(layer, x, y, constants):
+  """Checks that the output spec of a layer matches the expected spec."""
   expected = layer.get_output_spec(x.channel_spec, constants=constants)
   if y.channel_shape != expected.shape:
     raise ValueError(
@@ -341,6 +407,7 @@ def _check_output_spec(layer, x, y, constants):
 
 
 def _check_output_ratio(layer, x, y):
+  """Checks that the output length of a layer matches the expected length."""
   expected_length = x.shape[1] * layer.output_ratio
   if y.shape[1] != expected_length:
     raise ValueError(
@@ -354,8 +421,8 @@ def check_layer(layer_fn):
   """Validates layer inputs and outputs."""
 
   @functools.wraps(layer_fn)
-  def wrapper(self, x, *, constants=None):
-    y = layer_fn(self, x, constants=constants)
+  def wrapper(self, x, *, training: bool, constants=None):
+    y = layer_fn(self, x, training=training, constants=constants)
     _check_output_spec(self, x, y, constants)
     return y
 
@@ -366,7 +433,7 @@ def check_step(step_fn):
   """Validates step inputs and outputs."""
 
   @functools.wraps(step_fn)
-  def wrapper(self, x, state, *, constants=None):
+  def wrapper(self, x, state, *, training: bool, constants=None):
     if not self.supports_step:
       raise ValueError(f'{self.__class__.__name__} does not support step().')
     block_size = self.block_size
@@ -375,7 +442,7 @@ def wrapper(self, x, state, *, constants=None):
           f'{self.__class__.__name__} received input with'
           f' {x.shape=} not a multiple of {block_size=}.'
       )
-    y, state = step_fn(self, x, state, constants=constants)
+    y, state = step_fn(self, x, state, training=training, constants=constants)
     _check_output_spec(self, x, y, constants)
     _check_output_ratio(self, x, y)
     return y, state
@@ -388,8 +455,71 @@ def wrapper(self, x, state, *, constants=None):
 # ---------------------------------------------------------------------------
 
 
-class Steppable(types.Steppable):
-  """A sequence processing layer that supports layer and step modes."""
+class Steppable(spec.Steppable[Sequence, Sequence, ChannelSpec]):
+  """A sequence processing layer that can be executed layerwise or stepwise.
+
+  # Step-wise execution:
+
+  A SequenceLayer supports step-wise execution if its `supports_step` property
+  is true. Most built-in SequenceLayers support step-wise processing by default,
+  but may support processing features that are not causal and therefore cannot
+  be executed step-by-step (e.g. non-causal convolutions, bidirectional RNNs,
+  etc.).
+
+  When executing step-wise, use the `step` or `step_with_emits` method to
+  process a block of inputs (a `Sequence` shaped `[b, block_size * n, ...]`) and
+  a `state` input whose structure matches `get_initial_state`.
+
+  This produces:
+  - An output `Sequence` shaped  `[b, block_size * n * output_ratio, ...]`
+    whose `...` shape matches `get_output_shape`.
+  - A `state` output whose structure matches `get_initial_state`.
+  - (Optionally) an `emits` output.
+
+  The output `Sequence` is the primary output of the step, while the `emits`
+  represent "auxiliary" outputs that are produced by the layer (for example,
+  debug output).
+
+  # Layer-wise execution:
+
+  When executing layer-wise, use the `layer` or `layer_with_emits` method to
+  process inputs (a `Sequence` shaped `[b, t, ...]`).
+
+  This produces:
+  - An output `Sequence` shaped  `[b,  t * output_ratio, ...]`
+    whose `...` shape matches `get_output_shape`.
+  - (Optionally) an `emits` output.
+
+  The output `Sequence` is the primary output of the layer, while the `emits`
+  represent "auxiliary" outputs that are produced by the layer (for example,
+  debug output).
+
+  # Latency
+
+  SequenceLayers have an input and output "latency" to describe their latency
+  characteristics. Latency is the number of input or output timesteps from
+  step-wise excecution that are input or output before the step-wise output of
+  the layer matches the layer-wise output of the layer.
+
+  An invariant that all layers must maintain is that for the layer-wise output
+  and step-wise output:
+
+  ```
+  y_layer = l.layer(x, training=training)
+
+  # Pad x with input_latency timesteps to process the entire sequence:
+  x = x.pad_time(0, l.input_latency, valid=False)
+
+  y_step, _, _ = utils.step_by_step_dynamic(l, x, training=training)
+  ```
+
+  The step-wise output is equivalent to the layer-wise output after dropping the
+  initial latency timesteps of the step-wise output:
+
+  ```
+  y_layer == y_step[:, l.output_latency:]
+  ```
+  """
 
   @property
   @override
@@ -448,14 +578,49 @@ def receptive_field(self) -> ReceptiveField:
   @abc.abstractmethod
   @override
   def layer(
-      self, x: Sequence, *, constants: Constants | None = None
+      self, x: Sequence, *, training: bool, constants: Constants | None = None
   ) -> Sequence:
-    """Process this layer layer-wise."""
+    """Process this layer layer-wise.
+
+    Args:
+      x: Input sequence with values shaped [b, t_i, ...].
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the source sequence to attend to.
+
+    Returns:
+      y: The outputs corresponding to this layer with values shaped
+        [b, t_o, ...] where `t_o == t_i * output_ratio`. t_o may have been
+        truncated to only represent valid frames.
+    """
 
+  @override
   def layer_with_emits(
-      self, x: Sequence, *, constants: Constants | None = None
+      self, x: Sequence, *, training: bool, constants: Constants | None = None
   ) -> tuple[Sequence, Emits]:
-    return self.layer(x, constants=constants), ()
+    """Process this layer layer-wise, producing emitted arrays.
+
+    This is like `layer`, except it has an additional return value which is the
+    "emitted" arrays for the layer. The emitted arrays are a structure of
+    arrays whose values are arrays or `Sequence`s.
+
+    Args:
+      x: Input sequence with values shaped [b, t_i, ...].
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the key/value sequence to attend to.
+
+    Returns:
+      y: The outputs corresponding to this layer with values shaped
+        [b, t_o, ...] where `t_o == t_i * output_ratio`. t_o may have been
+        truncated to only represent valid frames.
+      emits: A nest of emitted arrays or Sequences.
+    """
+    return self.layer(x, training=training, constants=constants), ()
 
   @abc.abstractmethod
   @override
@@ -464,18 +629,63 @@ def step(
       x: Sequence,
       state: State,
       *,
+      training: bool,
       constants: Constants | None = None,
   ) -> tuple[Sequence, State]:
-    """Process this layer step-wise."""
+    """Process this layer step-wise.
+
+    Args:
+      x: Input sequence with values shaped [b, t_i, ...], where t_i is a
+        multiple of block_size.
+      state: A structure of state arrays matching get_initial_state. The
+        previous state for this layer.
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the key/value sequence to attend to.
+
+    Returns:
+      y: The outputs corresponding to this step with values shaped [b, t_o, ...]
+        where `t_o == t_i * output_ratio`.
+      state: A structure of state arrays matching get_initial_state. The
+        new state for this layer.
+    """
 
+  @override
   def step_with_emits(
       self,
       x: Sequence,
       state: State,
       *,
+      training: bool,
       constants: Constants | None = None,
   ) -> tuple[Sequence, State, Emits]:
-    y, state = self.step(x, state, constants=constants)
+    """Process this layer step-wise, producing emitted arrays.
+
+    This is like `step`, except it has an additional return value which is the
+    "emitted" arrays for the step. The emitted arrays are a structure of
+    arrays whose values are arrays or `Sequence`s.
+
+    Args:
+      x: Input sequence with values shaped [b, t_i, ...], where t_i is a
+        multiple of block_size.
+      state: A structure of state arrays matching get_initial_state. The
+        previous state for this layer.
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the key/value sequence to attend to.
+
+    Returns:
+      y: The outputs corresponding to this step with values shaped [b, t_o, ...]
+        where `t_o == t_i * output_ratio`.
+      state: A structure of state arrays matching get_initial_state. The
+        new state for this layer.
+      emits: A nest of emitted arrays or Sequences.
+    """
+    y, state = self.step(x, state, training=training, constants=constants)
     return y, state, ()
 
   @abc.abstractmethod
@@ -485,9 +695,24 @@ def get_initial_state(
       batch_size: int,
       input_spec: ChannelSpec,
       *,
+      training: bool,
       constants: Constants | None = None,
   ) -> State:
-    """Returns the initial state for step-wise processing."""
+    """Returns the initial state for this SequenceLayer.
+
+    Args:
+      batch_size: The batch size to create state for.
+      input_spec: An input ChannelSpec representing the channel shape and dtype
+        of the input that will be stepped.
+      training: Python bool. Whether we are in training mode.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the source sequence to attend to.
+
+    Returns:
+      An integer, shape, or structure of integer/shapes.
+    """
 
   @abc.abstractmethod
   @override
@@ -497,7 +722,20 @@ def get_output_shape(
       *,
       constants: Constants | None = None,
   ) -> Shape:
-    """Returns the output channel shape for an input channel shape."""
+    """Returns the output channel shape this layer produces for an input channel shape.
+
+    Args:
+      input_shape: A shape representing the channels dimension of the input
+        sequence (i.e. not including the batch or time dimension).
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing. For example, for an attention layer this may
+        contain the source sequence to attend to.
+
+    Returns:
+      A shape representing the output channels dimensions (i.e. not including
+      the batch or time dimension).
+    """
 
   @abc.abstractmethod
   @override
@@ -507,14 +745,37 @@ def get_output_dtype(
       *,
       constants: Constants | None = None,
   ) -> DType:
-    """Returns the output dtype for an input dtype."""
+    """Returns the layer's output dtype for the specified input dtype.
+
+    Args:
+      input_dtype: The dtype of the input features.
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing.
+
+    Returns:
+      The dtype of the output features.
+    """
 
+  @override
   def get_output_spec(
       self,
       input_spec: ChannelSpec,
       *,
       constants: Constants | None = None,
   ) -> ChannelSpec:
+    """Returns the output spec this layer produces for the provided input spec.
+
+    Args:
+      input_spec: A ChannelSpec which represents the channels shape and dtype of
+        the input sequence (i.e. not including the batch or time dimension).
+      constants: A dictionary of constant name to array or sl.Sequence. Values
+        or sequences that are "constant" with respect to the SequenceLayer, but
+        may affect its processing.
+
+    Returns:
+      The ChannelSpec of the output features.
+    """
     shape = self.get_output_shape(input_spec.shape, constants=constants)
     dtype = self.get_output_dtype(input_spec.dtype, constants=constants)
     return ChannelSpec(shape, dtype)
@@ -525,20 +786,27 @@ def get_output_spec(
 # ---------------------------------------------------------------------------
 
 
-class SequenceLayer(nn.Module, Steppable):
-  """Base MLX Module for Sequence Layers."""
+class SequenceLayer(
+    nn.Module,
+    Steppable,
+    spec.SequenceLayer[Sequence, Sequence, ChannelSpec],
+    metaclass=abc.ABCMeta,
+):
+  """Base Module for Sequence Layers."""
 
 
-class SequenceLayerConfig(types.SequenceLayerConfig):
+class SequenceLayerConfig(spec.SequenceLayerConfig):
   """Base class for SequenceLayer configuration objects."""
 
   @abc.abstractmethod
+  @override
   def make(self) -> SequenceLayer:
     """Builds a SequenceLayer from this config."""
 
-  def copy(self, **kwargs) -> 'SequenceLayerConfig':
+  @override
+  def copy(self, **kwargs) -> Self:
     """Returns a copy of the config with updated fields."""
-    return dataclasses.replace(self, **kwargs)
+    return cast(Self, dataclasses.replace(cast(Any, self), **kwargs))
 
 
 # ---------------------------------------------------------------------------
@@ -546,19 +814,24 @@ def copy(self, **kwargs) -> 'SequenceLayerConfig':
 # ---------------------------------------------------------------------------
 
 
-class PreservesType:
-  """Mix-in: layer does not change the input dtype."""
+class PreservesType(spec.PreservesType):
+  """A mix-in for layers that do not change the input dtype."""
 
+  @override
   def get_output_dtype(
-      self, input_dtype: DType, *, constants: Constants | None = None
+      self,
+      input_dtype: DType,
+      *,
+      constants: Constants | None = None,
   ) -> DType:
     del constants
     return input_dtype
 
 
-class PreservesShape:
-  """Mix-in: layer does not change the input channel shape."""
+class PreservesShape(spec.PreservesShape):
+  """A mix-in for layers that do not change the input shape."""
 
+  @override
   def get_output_shape(
       self,
       input_shape: ShapeLike,
@@ -574,47 +847,112 @@ def get_output_shape(
 # ---------------------------------------------------------------------------
 
 
-class Stateless(SequenceLayer):
-  """A SequenceLayer with no step state."""
+class Stateless(SequenceLayer, spec.Stateless[Sequence, Sequence, ChannelSpec]):
+  """A SequenceLayer with no state over time required for step-wise processing.
+
+  Sub-classes must also implement:
+  - layer
+  - get_output_shape
+  - get_output_dtype
+  """
 
+  @override
   def get_initial_state(
       self,
       batch_size: int,
       input_spec: ChannelSpec,
       *,
+      training: bool,
       constants: Constants | None = None,
   ) -> State:
+    del batch_size
+    del input_spec
+    del training
+    del constants
     return ()
 
+  @abc.abstractmethod
+  @override
+  def get_output_shape(
+      self,
+      input_shape: ShapeLike,
+      *,
+      constants: Constants | None = None,
+  ) -> Shape:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def get_output_dtype(
+      self,
+      input_dtype: DType,
+      *,
+      constants: Constants | None = None,
+  ) -> DType:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def layer(
+      self,
+      x: Sequence,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> Sequence:
+    ...
+
+  @override
   def step(
       self,
       x: Sequence,
       state: State,
       *,
+      training: bool,
       constants: Constants | None = None,
   ) -> tuple[Sequence, State]:
-    return self.layer(x, constants=constants), state
+    return self.layer(x, training=training, constants=constants), state
 
 
-class StatelessPointwise(PreservesShape, Stateless):
-  """Stateless layer that operates pointwise (preserves shape)."""
+class StatelessPointwise(
+    PreservesShape,
+    Stateless,
+    spec.StatelessPointwise[Sequence, Sequence, ChannelSpec],
+    metaclass=abc.ABCMeta,
+):
+  """A SequenceLayer that has no state and operates pointwise on its input."""
 
 
-class StatelessPointwiseFunctor(StatelessPointwise, metaclass=abc.ABCMeta):
-  """Stateless pointwise layer defined by a fn(values, mask)."""
+class StatelessPointwiseFunctor(
+    StatelessPointwise,
+    spec.StatelessPointwiseFunctor[Sequence, Sequence, ChannelSpec],
+):
+  """A stateless SequenceLayer for simple pointwise processing fns."""
 
   @abc.abstractmethod
+  @override
   def fn(self, values: ValuesT, mask: MaskT) -> tuple[ValuesT, MaskT]:
     """Transforms each scalar in values independently."""
 
   @property
+  @override
   def mask_required(self):
+    """Returns true if fn can change the sequence's masked state.
+
+    If fn(0) -> 0, then mask_required() is False.
+    """
     return True
 
   @check_layer
-  def layer(
-      self, x: Sequence, *, constants: Constants | None = None
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self,
+      x: Sequence,
+      *,
+      training: bool,
+      constants: Constants | None = None,
   ) -> Sequence:
+    del training
     if self.mask_required:
       y = x.apply(self.fn)
     else:
@@ -622,7 +960,7 @@ def layer(
     # Ensure MaskedSequence -> Sequence conversion for apply.
     if isinstance(y, MaskedSequence) and self.mask_required:
       y = Sequence(y.values, y.mask)
-    return y
+    return cast(Sequence, y)
 
 
 # ---------------------------------------------------------------------------
@@ -630,60 +968,170 @@ def layer(
 # ---------------------------------------------------------------------------
 
 
-class Emitting(SequenceLayer, metaclass=abc.ABCMeta):
-  """A SequenceLayer that emits auxiliary tensors."""
+class Emitting(
+    SequenceLayer,
+    spec.Emitting[Sequence, Sequence, ChannelSpec],
+):
+  """A SequenceLayer that emits auxiliary arrays.
 
-  def step(
+  This is a convenience subclass that implements step and layer in terms of
+  step_with_emits and layer_with_emits, so that implementors need only implement
+  two of the four methods. For emits that are substantially expensive to compute
+  subclasses can choose to implement all four and save computation in those that
+  do not produce emits.
+  """
+
+  @abc.abstractmethod
+  @override
+  def get_initial_state(
       self,
-      x: Sequence,
-      state: State,
+      batch_size: int,
+      input_spec: ChannelSpec,
       *,
+      training: bool,
       constants: Constants | None = None,
-  ) -> tuple[Sequence, State]:
-    y, state, _ = self.step_with_emits(x, state, constants=constants)
-    return y, state
+  ) -> State:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def get_output_shape(
+      self,
+      input_shape: ShapeLike,
+      *,
+      constants: Constants | None = None,
+  ) -> Shape:
+    ...
 
   @abc.abstractmethod
+  @override
+  def get_output_dtype(
+      self,
+      input_dtype: DType,
+      *,
+      constants: Constants | None = None,
+  ) -> DType:
+    ...
+
+  @abc.abstractmethod
+  @override
   def step_with_emits(
       self,
       x: Sequence,
       state: State,
       *,
+      training: bool,
       constants: Constants | None = None,
   ) -> tuple[Sequence, State, Emits]:
-    pass
-
-  def layer(
-      self, x: Sequence, *, constants: Constants | None = None
-  ) -> Sequence:
-    y, _ = self.layer_with_emits(x, constants=constants)
-    return y
+    ...
 
   @abc.abstractmethod
+  @override
   def layer_with_emits(
-      self, x: Sequence, *, constants: Constants | None = None
+      self,
+      x: Sequence,
+      *,
+      training: bool,
+      constants: Constants | None = None,
   ) -> tuple[Sequence, Emits]:
-    pass
+    ...
 
+  @override
+  def step(
+      self,
+      x: Sequence,
+      state: State,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[Sequence, State]:
+    output, state, _ = self.step_with_emits(
+        x, state, training=training, constants=constants
+    )
+    return output, state
 
-class StatelessEmitting(Emitting):
-  """Stateless layer that emits auxiliary tensors."""
+  @override
+  def layer(
+      self,
+      x: Sequence,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> Sequence:
+    outputs, _ = self.layer_with_emits(
+        x, training=training, constants=constants
+    )
+    return outputs
 
-  def step_with_emits(
+
+class StatelessEmitting(
+    Emitting,
+    spec.StatelessEmitting[Sequence, Sequence, ChannelSpec],
+):
+  """A SequenceLayer with no state over time that emits auxiliary arrays.
+
+  Sub-classes must implement:
+  - layer_with_emits
+  - get_output_shape
+  - get_output_dtype
+  """
+
+  @abc.abstractmethod
+  @override
+  def get_output_shape(
+      self,
+      input_shape: ShapeLike,
+      *,
+      constants: Constants | None = None,
+  ) -> Shape:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def get_output_dtype(
+      self,
+      input_dtype: DType,
+      *,
+      constants: Constants | None = None,
+  ) -> DType:
+    ...
+
+  @abc.abstractmethod
+  @override
+  def layer_with_emits(
       self,
       x: Sequence,
-      state: State,
       *,
+      training: bool,
       constants: Constants | None = None,
-  ) -> tuple[Sequence, State, Emits]:
-    y, emits = self.layer_with_emits(x, constants=constants)
-    return y, state, emits
+  ) -> tuple[Sequence, Emits]:
+    ...
 
+  @override
   def get_initial_state(
       self,
       batch_size: int,
       input_spec: ChannelSpec,
       *,
+      training: bool,
       constants: Constants | None = None,
   ) -> State:
+    del batch_size
+    del input_spec
+    del training
+    del constants
     return ()
+
+  @override
+  def step_with_emits(
+      self,
+      x: Sequence,
+      state: State,
+      *,
+      training: bool,
+      constants: Constants | None = None,
+  ) -> tuple[Sequence, State, Emits]:
+    outputs, emits = self.layer_with_emits(
+        x, training=training, constants=constants
+    )
+    return outputs, state, emits
diff --git a/sequence_layers/mlx/types_test.py b/sequence_layers/mlx/types_test.py
index 113be46..401db66 100644
--- a/sequence_layers/mlx/types_test.py
+++ b/sequence_layers/mlx/types_test.py
@@ -11,65 +11,59 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Tests for MLX sequence types."""
 
 from absl.testing import absltest
-import mlx.core as mx
-import numpy as np
-from sequence_layers.abstract import types_test_base
-from sequence_layers.mlx import types
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import types_behaviors as spec
 
 
-class SequenceTest(types_test_base.SequenceTest):
+class ModuleInterfaceTest(
+    test_utils.SequenceLayerTest, spec.ModuleInterfaceTest
+):
+  pass
 
-  def get_backend(self):
-    return mx
 
-  @property
-  def Sequence(self):
-    return types.Sequence
+class SequenceTest(test_utils.SequenceLayerTest, spec.SequenceTest):
+  pass
 
-  @property
-  def MaskedSequence(self):
-    return types.MaskedSequence
 
-  def assertAllEqual(self, a, b):
-    a = np.array(a) if isinstance(a, mx.array) else a
-    b = np.array(b) if isinstance(b, mx.array) else b
-    np.testing.assert_array_equal(a, b)
+class SequenceLayerConfigTest(
+    test_utils.SequenceLayerTest, spec.SequenceLayerConfigTest
+):
+  pass
 
-  def assertSequencesEqual(self, a, b):
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.mask, b.mask)
 
+class SteppableTest(test_utils.SequenceLayerTest, spec.SteppableTest):
+  pass
 
-class SteppableTest(types_test_base.SteppableTest):
 
-  def create_steppable(self):
+class PreservesTypeTest(test_utils.SequenceLayerTest, spec.PreservesTypeTest):
+  pass
 
-    class DefaultSteppable(types.Steppable):
 
-      def layer(self, x, *, constants=None):
-        return x
+class PreservesShapeTest(test_utils.SequenceLayerTest, spec.PreservesShapeTest):
+  pass
 
-      def step(self, x, state, *, constants=None):
-        return x, state
 
-      def get_initial_state(self, batch_size, input_spec, *, constants=None):
-        return ()
+class StatelessTest(test_utils.SequenceLayerTest, spec.StatelessTest):
+  pass
 
-      def get_output_shape(self, input_shape, *, constants=None):
-        return input_shape
 
-      def get_output_dtype(self, input_dtype, *, constants=None):
-        return input_dtype
+class EmittingTest(test_utils.SequenceLayerTest, spec.EmittingTest):
+  pass
 
-    return DefaultSteppable()
 
+class StatelessEmittingTest(
+    test_utils.SequenceLayerTest, spec.StatelessEmittingTest
+):
+  pass
 
-class SequenceLayerConfigTest(types_test_base.SequenceLayerConfigTest):
 
-  def get_config_base_cls(self):
-    return types.SequenceLayerConfig
+class StatelessPointwiseFunctorTest(
+    test_utils.SequenceLayerTest, spec.StatelessPointwiseFunctorTest
+):
+  pass
 
 
 if __name__ == '__main__':

From 67cae9b1b84094e85d606a0958d03e400ff008ad Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Fri, 29 May 2026 09:32:30 +0000
Subject: [PATCH 04/29] refactor: Abstract and implement multi-backend testing
 utilities

* Abstract testing structures into backend-agnostic specification tests.
* Implement concrete, reusable testing utilities in specs/test_utils.py.
* Inherit spec tests in JAX and MLX backend test suites for strict equivalence.

Co-authored-by: David Braun <davidbraun@google.com>
Co-authored-by: Kehang Han <kehanghan@google.com>

PiperPending-RevId: 923278025
PiperOrigin-RevId: 923278025
---
 sequence_layers/jax/test_utils.py             | 265 ++++++-------
 sequence_layers/jax/test_utils_test.py        | 175 ++++-----
 sequence_layers/mlx/test_utils.py             | 271 ++++++++++++++
 sequence_layers/mlx/test_utils_test.py        |  37 ++
 sequence_layers/specs/test_utils.py           | 182 ++++++++-
 sequence_layers/specs/test_utils_behaviors.py | 347 ++++++++++++++++++
 6 files changed, 1053 insertions(+), 224 deletions(-)
 create mode 100644 sequence_layers/mlx/test_utils.py
 create mode 100644 sequence_layers/mlx/test_utils_test.py
 create mode 100644 sequence_layers/specs/test_utils_behaviors.py

diff --git a/sequence_layers/jax/test_utils.py b/sequence_layers/jax/test_utils.py
index a7f8ede..54c58e8 100644
--- a/sequence_layers/jax/test_utils.py
+++ b/sequence_layers/jax/test_utils.py
@@ -11,17 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# PyType limitation that interprets jax/typing.py as stdlib typing:
+# pytype: disable=module-attr,signature-mismatch
 """Test utilities."""
 
 import dataclasses
 import functools
-import itertools
 import logging
 import random
-from typing import Any, Callable, Iterable, Mapping, Sequence as TypingSequence, TypeVar
+from typing import Any, Callable, Iterable, Mapping, Sequence as TypingSequence, TypeVar, override
 
 from absl.testing import absltest
-from absl.testing import parameterized
 import chex
 import flax.linen as nn
 import jax
@@ -75,7 +76,7 @@ def random_sequence(
     raise ValueError('Must not specify random_mask and random_lengths.')
   if len(dims) < 2:
     raise ValueError(
-        'random_sequence expects at least 2 dimensions, got: %s' % (dims,)
+        f'random_sequence expects at least 2 dimensions, got: {dims}'
     )
 
   is_complex = dtype in (np.complex64, np.complex128)
@@ -272,33 +273,7 @@ def zip_longest(
     zipped argument internally sorted (target, source). If either input sequence
     was longer, the last element of the shorter input sequence is repeated.
   """
-
-  results = []
-  prev_source, prev_target = None, None
-  for source, target in itertools.zip_longest(sources, targets):
-    # If either runs out ahead-of-time, we repeat the final non-None element.
-    # (This is safest as we cannot inspect the function's defaults.)
-    if source is None:
-      source = prev_source
-    elif target is None:
-      target = prev_target
-
-    if isinstance(target, Mapping):
-      assert isinstance(source, Mapping)
-      results.append({**target, **source})
-    elif isinstance(sources, Iterable):
-      # target is a non-mapping iterable, like tuple or list.
-      if isinstance(source, Mapping):
-        # To match the target, we replace the source with its unlabeled values.
-        source = source.values()
-      results.append((*target, *source))
-      prev_source, prev_target = source, target
-    else:
-      raise NotImplementedError(
-          f'Targets of type {type(target)=} are unsupported.'
-      )
-
-  return results
+  return spec.zip_longest(targets, sources)
 
 
 def named_product(
@@ -323,62 +298,8 @@ def named_product(
     `{first_item_name}_{second_item_name}`. If both iterators' items are
     mappings, the product's items are mappings; otherwise they are ordered
     tuples.
-
-    For example, if `first` is
-    `[{**foo, 'testcase_name': 'foo'}, {**bar, 'testcase_name': 'bar'}]` and
-    `second` is `[['baz', *baz], ['qux', *qux]]`, the items will be
-    `('foo_baz', *foo.values(), *baz), ('foo_qux', *foo.values(), *qux), ...`
-
-  Raises:
-    ValueError: A testcase_name is missing; either an iterator item is empty, or
-      one is a mapping without a `testcase_name` key.
   """
-
-  results = []
-
-  for p1, p2 in itertools.product(first, second):
-
-    for source, parameters in enumerate([p1, p2]):
-      if isinstance(parameters, Mapping):
-        if 'testcase_name' not in parameters:
-          raise ValueError(
-              f'Mapping {parameters} from iterable #{source+1} does not have'
-              ' key `testcase_name`.'
-          )
-      elif not parameters:
-        raise ValueError(
-            f'An sequence from iterable #{source+1} is empty; the first entry'
-            ' is expected to be a testcase name.'
-        )
-
-    # When both are mappings, we merge by key:
-    if isinstance(p1, Mapping) and isinstance(p2, Mapping):
-      testcase_name = f'{p1["testcase_name"]}_{p2["testcase_name"]}'
-      p1 = {k: v for k, v in p1.items() if k != 'testcase_name'}
-      p2 = {k: v for k, v in p2.items() if k != 'testcase_name'}
-      results.append({**p1, **p2, 'testcase_name': testcase_name})
-
-    # Else, we return an ordered tuple based on each parameter set's order:
-    else:
-
-      if isinstance(p1, Mapping):
-        p1_name = p1['testcase_name']
-        p1 = tuple(v for k, v in p1.items() if k != 'testcase_name')
-      else:
-        p1_name = p1[0]
-        p1 = p1[1:]
-
-      if isinstance(p2, Mapping):
-        p2_name = p2['testcase_name']
-        p2 = tuple(v for k, v in p2.items() if k != 'testcase_name')
-      else:
-        p2_name = p2[0]
-        p2 = p2[1:]
-
-      testcase_name = f'{p1_name}_{p2_name}'
-      results.append((testcase_name, *p1, *p2))
-
-  return parameterized.named_parameters(*results)
+  return spec.named_product(first, second)
 
 
 def get_grad_tols(
@@ -403,8 +324,7 @@ def get_grad_tols(
       compute_dtype is None or compute_dtype == jnp.float32
   ):
     return {'grad_rtol': 1e-5, 'grad_atol': 1e-5}
-  else:
-    return {'grad_rtol': 1e-1, 'grad_atol': 1e-1}
+  return {'grad_rtol': 1e-1, 'grad_atol': 1e-1}
 
 
 def flax_init(layer: nn.Module, *args, **kwargs):
@@ -421,6 +341,7 @@ def init_layer(*args, **kwargs):
 
 
 def flax_apply(layer: nn.Module, params, *args, **kwargs):
+  """Applies a Flax module with the given parameters."""
   method = kwargs.pop('method', '__call__')
   should_jit = kwargs.pop('jit', True)
 
@@ -433,6 +354,7 @@ def layer_fn(params, *args, **kwargs):
 
 
 def sl_init(layer: types.SequenceLayer, *args, **kwargs):
+  """Initializes a SequenceLayer."""
   training = kwargs.pop('training', False)
   method = kwargs.pop('method', '__call__')
   should_jit = kwargs.pop('jit', True)
@@ -500,11 +422,10 @@ def pad_with_garbage(
 
     if isinstance(x, jax.Array):
       return jnp.pad(x, paddings, constant_values=pad_value)
-    else:
-      return type(x)(
-          jnp.pad(x.values, paddings, constant_values=pad_value),
-          jnp.pad(x.mask, [(1, 1), (0, 0)], constant_values=True),
-      )
+    return type(x)(
+        jnp.pad(x.values, paddings, constant_values=pad_value),
+        jnp.pad(x.mask, [(1, 1), (0, 0)], constant_values=True),
+    )
 
   return jax.tree.map(
       pad_with_garbage, tree, is_leaf=lambda x: isinstance(x, types.Sequence)
@@ -621,8 +542,12 @@ def fn(
     # avoid large gradient matrices.
     return jax.lax.reduce_sum(jnp.abs(y), axes=list(range(2, y.ndim)))
 
-  x_real_fn = lambda x_: types.Sequence(x_ + 1j * jnp.imag(x.values), x.mask)
-  x_imag_fn = lambda x_: types.Sequence(jnp.real(x.values) + 1j * x_, x.mask)
+  def x_real_fn(x_):
+    return types.Sequence(x_ + 1j * jnp.imag(x.values), x.mask)
+
+  def x_imag_fn(x_):
+    return types.Sequence(jnp.real(x.values) + 1j * x_, x.mask)
+
   jac_fn_real_y = functools.partial(
       fn, x_fn=lambda x_: types.Sequence(x_, x.mask), y_fn=jnp.real
   )
@@ -767,6 +692,7 @@ def fn(
 def _mask_and_pad_to_max_length(
     a: types.Sequence, b: types.Sequence
 ) -> tuple[types.Sequence, types.Sequence]:
+  """Masks invalid timesteps and pads two sequences to the same maximum length."""
   # Only compare values in non-masked regions.
   a = a.mask_invalid()
   b = b.mask_invalid()
@@ -846,18 +772,24 @@ def randomize_weights_fn(variables):
 
     return layer.bind(variables)
 
+  def init_layer(self, layer, x, **kwargs):
+    """Initialize and bind variables for JAX."""
+    key = jax.random.PRNGKey(1234)
+    return self.init_and_bind_layer(key, layer, x, **kwargs)
+
   def verify_masked(self, x: types.Sequence):
     """Asserts all invalid timesteps in x have values masked to zero."""
     # Manually mask even if x is a MaskedSequence.
     expected = types.Sequence(x.values, x.mask).mask_invalid()
     self.assertAllEqual(x.values, expected.values)
 
+  @override
   def verify_contract(
       self,
       l: types.SequenceLayer,
       x: types.Sequence,
       *,
-      training: bool,
+      training: bool = False,
       constants: types.Constants | None = None,
       stream_constants: bool = False,
       stream_constants_list: list[types.Constants] | None = None,
@@ -875,6 +807,7 @@ def verify_contract(
       test_padding_invariance: bool = True,
       test_receptive_field: bool = True,
       test_receptive_field_relaxed: bool = False,
+      **kwargs,
   ) -> types.Sequence:
     """Verifies that the provided layer obeys the SequenceLayer contract.
 
@@ -940,6 +873,7 @@ def verify_contract(
       test_receptive_field_relaxed: Whether to test the layer for receptive
         field with relaxed conditions, i.e., allowing the layer to report a
         receptive field that is larger than the actual receptive field.
+      **kwargs: Additional keyword arguments passed to the verifier.
 
     Returns:
       The output of
@@ -1131,6 +1065,7 @@ def _pad(x: types.Sequence, pad_back: int) -> types.Sequence:
       # Property 1: Check layer-wise and step-wise equivalence.
       self.assertSequencesClose(y_layer, y_step, rtol=rtol, atol=atol)
       if test_2x_step:
+        assert y_step_2x is not None
         self.assertSequencesClose(y_layer, y_step_2x, rtol=rtol, atol=atol)
 
       # Property 2: Padding invariance.
@@ -1147,6 +1082,7 @@ def _pad(x: types.Sequence, pad_back: int) -> types.Sequence:
         # is an integer type. go/jax-integer-autodiff
         assert y_layer_x_grad is not None
         if y_layer_x_grad.dtype != jax.dtypes.float0:
+          assert y_step_x_grad is not None
           self.assertSequencesClose(
               y_layer_x_grad, y_step_x_grad, rtol=grad_rtol, atol=grad_atol
           )
@@ -1210,17 +1146,93 @@ def _pad(x: types.Sequence, pad_back: int) -> types.Sequence:
             self.assertEqual(receptive_field, expected_receptive_field)
     return y_layer
 
-  def assertSequencesClose(  # pylint: disable=invalid-name
+  @override
+  def random_sequence(
       self,
-      a: types.Sequence,
-      b: types.Sequence,
+      *dims: int,
+      dtype=jnp.float32,
+      random_mask: bool = False,
+      random_lengths: bool | None = None,
+      low: int | None = 0,
+      high: int | None = 10,
+      low_length: int = 0,
+      high_length: int | None = None,
+  ) -> types.Sequence:
+    return random_sequence(
+        *dims,
+        dtype=dtype,
+        random_mask=random_mask,
+        random_lengths=random_lengths,
+        low=low,
+        high=high,
+        low_length=low_length,
+        high_length=high_length,
+    )
+
+  @override
+  # pyrefly: ignore[bad-override]
+  def _step_by_step(
+      self,
+      layer: types.SequenceLayer,
+      x: types.Sequence,
+      *,
+      block_size: int = 1,
+      constants=None,
+      stream_constants=None,
+  ) -> tuple[types.Sequence, Any]:
+    batch = x.values.shape[0] if hasattr(x, 'values') else x.shape[0]
+    time = x.values.shape[1] if hasattr(x, 'values') else x.shape[1]
+
+    input_spec = types.ShapeDType(x.channel_shape, x.dtype)
+
+    init_constants = dict(constants) if constants else {}
+    if stream_constants:
+      init_constants.update(stream_constants)
+
+    state = layer.get_initial_state(
+        batch, input_spec, constants=init_constants or None, training=False
+    )
+
+    outputs_values = []
+    outputs_masks = []
+
+    for t in range(0, time, block_size):
+      x_block = types.Sequence(
+          x.values[:, t : t + block_size],
+          x.mask[:, t : t + block_size],
+      )
+
+      step_constants = dict(constants) if constants else {}
+      if stream_constants:
+        for name, seq in stream_constants.items():
+          step_constants[name] = types.Sequence(
+              seq.values[:, t : t + block_size],
+              seq.mask[:, t : t + block_size],
+          )
+
+      y_block, state = layer.step(
+          x_block, state, constants=step_constants or None, training=False
+      )
+      outputs_values.append(y_block.values)
+      outputs_masks.append(y_block.mask)
+
+    y_values = jnp.concatenate(outputs_values, axis=1)
+    y_mask = jnp.concatenate(outputs_masks, axis=1)
+
+    return types.Sequence(y_values, y_mask), state
+
+  @override
+  def assertSequencesClose(  # pylint: disable=arguments-differ  # pyrefly: ignore[bad-override]
+      self,
+      x: types.Sequence,
+      y: types.Sequence,
       atol: float = 1e-6,
       rtol: float = 1e-6,
   ):
     """After padding, checks sequence values are close and masks are equal."""
-    a, b = _mask_and_pad_to_max_length(a, b)
-    self.assertAllClose(a.values, b.values, atol=atol, rtol=rtol)
-    self.assertAllEqual(a.mask, b.mask)
+    x, y = _mask_and_pad_to_max_length(x, y)
+    self.assertAllClose(x.values, y.values, atol=atol, rtol=rtol)
+    self.assertAllEqual(x.mask, y.mask)
 
   def assertSequencesNotClose(  # pylint: disable=invalid-name
       self,
@@ -1234,15 +1246,14 @@ def assertSequencesNotClose(  # pylint: disable=invalid-name
     self.assertNotAllClose(a.values, b.values, atol=atol, rtol=rtol)
     self.assertAllEqual(a.mask, b.mask)
 
-  def assertSequencesEqual(  # pylint: disable=invalid-name
-      self,
-      a: types.Sequence,
-      b: types.Sequence,
-  ):
+  @override
+  def assertSequencesEqual(  # pyrefly: ignore[bad-override]
+      self, x: types.Sequence, y: types.Sequence
+  ) -> None:
     """After padding, checks sequence values are equal and masks are equal."""
-    a, b = _mask_and_pad_to_max_length(a, b)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.mask, b.mask)
+    x, y = _mask_and_pad_to_max_length(x, y)
+    self.assertAllEqual(x.values, y.values)
+    self.assertAllEqual(x.mask, y.mask)
 
   def assertSequencesNotEqual(  # pylint: disable=invalid-name
       self,
@@ -1254,15 +1265,16 @@ def assertSequencesNotEqual(  # pylint: disable=invalid-name
     self.assertNotAllEqual(a.values, b.values)
     self.assertAllEqual(a.mask, b.mask)
 
-  def assertAllEqual(self, a, b):  # pylint: disable=invalid-name
+  @override
+  def assertAllEqual(self, x, y):  # pylint: disable=invalid-name
     """Asserts that two arrays are equal."""
-    if jnp.iscomplexobj(a) or jnp.iscomplexobj(b):
-      a_real, a_imag = jnp.real(a), jnp.imag(a)
-      b_real, b_imag = jnp.real(b), jnp.imag(b)
-      chex.assert_trees_all_equal(a_real, b_real)
-      chex.assert_trees_all_equal(a_imag, b_imag)
+    if jnp.iscomplexobj(x) or jnp.iscomplexobj(y):
+      x_real, x_imag = jnp.real(x), jnp.imag(x)
+      y_real, y_imag = jnp.real(y), jnp.imag(y)
+      chex.assert_trees_all_equal(x_real, y_real)
+      chex.assert_trees_all_equal(x_imag, y_imag)
     else:
-      chex.assert_trees_all_equal(a, b)
+      chex.assert_trees_all_equal(x, y)
 
   def assertAllClose(self, a, b, atol: float = 1e-6, rtol: float = 1e-6):  # pylint: disable=invalid-name
     """Asserts that two arrays have close values."""
@@ -1280,9 +1292,7 @@ def assertNotAllEqual(self, a, b):  # pylint: disable=invalid-name
       chex.assert_trees_all_equal(a, b)
     except AssertionError:
       return
-    raise AssertionError(
-        'The two values are equal at all elements. %s %s' % (a, b)
-    )
+    raise AssertionError(f'The two values are equal at all elements. {a} {b}')
 
   def assertNotAllClose(self, a, b, atol: float = 1e-6, rtol: float = 1e-6):  # pylint: disable=invalid-name
     """Asserts that two arrays do not have close values."""
@@ -1290,9 +1300,7 @@ def assertNotAllClose(self, a, b, atol: float = 1e-6, rtol: float = 1e-6):  # py
       self.assertAllClose(a, b, atol=atol, rtol=rtol)
     except AssertionError:
       return
-    raise AssertionError(
-        'The two values are close at all elements. %s %s' % (a, b)
-    )
+    raise AssertionError(f'The two values are close at all elements. {a} {b}')
 
 
 class AssertConstantsLayer(types.PreservesType, types.StatelessPointwise):
@@ -1300,14 +1308,18 @@ class AssertConstantsLayer(types.PreservesType, types.StatelessPointwise):
 
   @dataclasses.dataclass(frozen=True)
   class Config(types.SequenceLayerConfig):
+    """Configuration for AssertConstantsLayer."""
+
     expected_constant: str = 'test'
     name: str | None = None
 
+    @override
     def make(self) -> 'AssertConstantsLayer':
       return AssertConstantsLayer(self, name=self.name)
 
   config: Config
 
+  @override
   def get_initial_state(
       self,
       batch_size: int,
@@ -1322,6 +1334,7 @@ def get_initial_state(
         batch_size, input_spec, training=training, constants=constants
     )
 
+  @override
   def get_output_shape(
       self,
       input_shape: types.ShapeLike,
@@ -1332,6 +1345,7 @@ def get_output_shape(
       raise ValueError(f'{self.config.expected_constant=} not present')
     return super().get_output_shape(input_shape, constants=constants)
 
+  @override
   def layer(
       self,
       x: types.Sequence,
@@ -1350,17 +1364,22 @@ class NonSteppableLayer(types.PreservesType, types.StatelessPointwise):
 
   @dataclasses.dataclass(frozen=True)
   class Config(types.SequenceLayerConfig):
+    """Configuration for NonSteppableLayer."""
+
     name: str | None = None
 
+    @override
     def make(self) -> 'NonSteppableLayer':
       return NonSteppableLayer(self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def supports_step(self):
     return False
 
+  @override
   def layer(
       self,
       x: types.Sequence,
diff --git a/sequence_layers/jax/test_utils_test.py b/sequence_layers/jax/test_utils_test.py
index 5c113e3..ba74343 100644
--- a/sequence_layers/jax/test_utils_test.py
+++ b/sequence_layers/jax/test_utils_test.py
@@ -13,17 +13,52 @@
 # limitations under the License.
 """Tests for the test utilities."""
 
-from unittest import mock
 from absl.testing import parameterized
-import numpy as np
+import jax
+import jax.numpy as jnp
+import sequence_layers.jax as sl
 from sequence_layers.jax import test_utils
+from sequence_layers.specs import test_utils_behaviors as spec
+
+
+class ModuleSpecTest(test_utils.SequenceLayerTest, spec.ModuleSpecTest):
+
+  def test_module_spec_with_typeguard(self) -> None:
+    self.skipTest(
+        'typeguard v3 has a bug that crashes at runtime when validating method '
+        'signatures of Protocols against module instances.'
+    )
+
+
+class VerifyContractTest(test_utils.SequenceLayerTest, spec.VerifyContractTest):
+
+  def get_dummy_layer(self, mismatch: bool):
+    l = super().get_dummy_layer(mismatch)
+    key = jax.random.PRNGKey(1234)
+    x = test_utils.random_sequence(2, 5, 10)
+    l = self.init_and_bind_layer(key, l, x)
+    return l
+
+  def test_verify_contract_with_jax_flags(self):
+    """Tests that disabling optional JAX features (gradients, batching) doesn't crash.
+
+    Default paths (with these flags as True) are tested in all other tests.
+    """
+    layer = self.get_dummy_layer(mismatch=False)
+    x = sl.Sequence(
+        jnp.ones((2, 5, 10)),
+        jnp.ones((2, 5), dtype=bool),
+    )
+    self.verify_contract(
+        layer, x, training=False, test_gradients=False, test_batching=False
+    )
 
 
 class StandardDtypeConfigsTest(test_utils.SequenceLayerTest):
 
   @parameterized.parameters(
       (
-          dict(),
+          {},
           {
               'p-fp32_i-fp32_c-None',  # default
               'p-bf16_i-bf16_c-bf16',  # praxis
@@ -33,7 +68,7 @@ class StandardDtypeConfigsTest(test_utils.SequenceLayerTest):
           },
       ),
       (
-          dict(param=True, compute=True),
+          {'param': True, 'compute': True},
           {
               'p-fp32_c-None',  # default
               'p-bf16_c-bf16',  # praxis
@@ -43,7 +78,7 @@ class StandardDtypeConfigsTest(test_utils.SequenceLayerTest):
           },
       ),
       (
-          dict(praxis_only=True),
+          {'praxis_only': True},
           {
               'p-fp32_i-fp32_c-None',  # default
               'p-bf16_i-bf16_c-bf16',  # praxis
@@ -58,113 +93,55 @@ def test_standard_dtype_configs_returns_names(self, kwargs, expected):
     self.assertEqual(expected, names)
 
 
-class NamedProductTest(test_utils.SequenceLayerTest):
+class NamedProductTest(test_utils.SequenceLayerTest, spec.NamedProductTest):
+  pass
 
-  @parameterized.parameters(
-      dict(
-          first=[('a', 'alpha'), ('b', 'beta')],
-          second=[('1', 1), ('2', 2), ('3', 3)],
-          expected=[
-              ('a_1', 'alpha', 1),
-              ('a_2', 'alpha', 2),
-              ('a_3', 'alpha', 3),
-              ('b_1', 'beta', 1),
-              ('b_2', 'beta', 2),
-              ('b_3', 'beta', 3),
-          ],
-      ),
-      dict(
-          first=[{'a': 'alpha', 'testcase_name': 'test'}],
-          second=[('1', 1), ('2', 2)],
-          expected=[
-              ('test_1', 'alpha', 1),
-              ('test_2', 'alpha', 2),
-          ],
-      ),
-      dict(
-          first=[
-              {'letter': 'a', 'testcase_name': 'alpha'},
-              {'testcase_name': 'beta', 'letter': 'b'},
-          ],
-          second=[
-              {'testcase_name': 'one', 'number': 1},
-              {'number': 2, 'testcase_name': 'two'},
-          ],
-          expected=[
-              {'letter': 'a', 'number': 1, 'testcase_name': 'alpha_one'},
-              {'letter': 'a', 'number': 2, 'testcase_name': 'alpha_two'},
-              {'letter': 'b', 'number': 1, 'testcase_name': 'beta_one'},
-              {'letter': 'b', 'number': 2, 'testcase_name': 'beta_two'},
-          ],
-      ),
-  )
-  @mock.patch.object(parameterized, 'named_parameters', autospec=True)
-  def test_builds_named_products(self, mock_fn, first, second, expected):
-    test_utils.named_product(first, second)
-    self.assertSequenceEqual(mock_fn.call_args.args, expected)
 
-  @parameterized.parameters(
-      dict(
-          first=[{'testcase_name': 'alpha', 'letter': 'a'}, {'letter': 'b'}],
-          second=[('1', 1), ('2', 2), ('3', 3)],
-          iterator_without_testcase_name=1,
-      ),
-      dict(
-          first=[{'testcase_name': 'alpha', 'letter': 'a'}],
-          second=[('1', 1), ()],
-          iterator_without_testcase_name=2,
-      ),
-  )
-  def test_raises_on_missing_testcase_names(
-      self, first, second, iterator_without_testcase_name
-  ):
-    with self.assertRaisesRegex(
-        ValueError, str(iterator_without_testcase_name)
-    ):
-      test_utils.named_product(first, second)
+class ZipLongestTest(test_utils.SequenceLayerTest, spec.ZipLongestTest):
+  pass
 
 
 class Shear2dTest(test_utils.SequenceLayerTest):
 
   @parameterized.named_parameters(
-      dict(
-          testcase_name='basic_3x3',
-          input_array=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-          expected_output=[
+      {
+          'testcase_name': 'basic_3x3',
+          'input_array': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          'expected_output': [
               [0, 0, 1, 2, 3],
               [0, 4, 5, 6, 0],
               [7, 8, 9, 0, 0],
           ],
-      ),
-      dict(
-          testcase_name='rect_more_rows',
-          input_array=[[1, 2], [3, 4], [5, 6]],
-          expected_output=[[0, 0, 1, 2], [0, 3, 4, 0], [5, 6, 0, 0]],
-      ),
-      dict(
-          testcase_name='rect_more_cols',
-          input_array=[[1, 2, 3, 4], [5, 6, 7, 8]],
-          expected_output=[[0, 1, 2, 3, 4], [5, 6, 7, 8, 0]],
-      ),
-      dict(
-          testcase_name='single_row',
-          input_array=[[1, 2, 3]],
-          expected_output=[[1, 2, 3]],
-      ),
-      dict(
-          testcase_name='single_col',
-          input_array=[[1], [2], [3]],
-          expected_output=[[0, 0, 1], [0, 2, 0], [3, 0, 0]],
-      ),
-      dict(
-          testcase_name='with_zeros',
-          input_array=[[0, 1], [0, 0]],
-          expected_output=[[0, 0, 1], [0, 0, 0]],
-      ),
+      },
+      {
+          'testcase_name': 'rect_more_rows',
+          'input_array': [[1, 2], [3, 4], [5, 6]],
+          'expected_output': [[0, 0, 1, 2], [0, 3, 4, 0], [5, 6, 0, 0]],
+      },
+      {
+          'testcase_name': 'rect_more_cols',
+          'input_array': [[1, 2, 3, 4], [5, 6, 7, 8]],
+          'expected_output': [[0, 1, 2, 3, 4], [5, 6, 7, 8, 0]],
+      },
+      {
+          'testcase_name': 'single_row',
+          'input_array': [[1, 2, 3]],
+          'expected_output': [[1, 2, 3]],
+      },
+      {
+          'testcase_name': 'single_col',
+          'input_array': [[1], [2], [3]],
+          'expected_output': [[0, 0, 1], [0, 2, 0], [3, 0, 0]],
+      },
+      {
+          'testcase_name': 'with_zeros',
+          'input_array': [[0, 1], [0, 0]],
+          'expected_output': [[0, 0, 1], [0, 0, 0]],
+      },
   )
   def test_shear_2d(self, input_array, expected_output):
-    output = test_utils._shear_2d(np.array(input_array))
-    self.assertAllEqual(output, np.array(expected_output))
+    output = test_utils._shear_2d(jnp.array(input_array))  # pylint: disable=protected-access
+    self.assertAllEqual(output, jnp.array(expected_output))
 
 
 if __name__ == '__main__':
diff --git a/sequence_layers/mlx/test_utils.py b/sequence_layers/mlx/test_utils.py
new file mode 100644
index 0000000..c35b6ad
--- /dev/null
+++ b/sequence_layers/mlx/test_utils.py
@@ -0,0 +1,271 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test utilities for MLX sequence layers."""
+
+from typing import Any, Callable, Iterable, Mapping, override
+from typing import Sequence as TypingSequence
+from typing import TypeVar
+
+from absl.testing import absltest
+import mlx.core as mx
+import numpy as np
+from sequence_layers import specs
+import sequence_layers.mlx as mlx_sl
+from sequence_layers.mlx import types
+from sequence_layers.specs import test_utils as spec
+
+Sequence = types.Sequence
+MaskedSequence = types.MaskedSequence
+ShapeDType = types.ShapeDType
+
+_T = TypeVar('_T')
+_TestFnT = Callable[..., None]
+
+
+def zip_longest(
+    targets: Iterable[Iterable[Any]],
+    sources: Iterable[_T],
+) -> list[_T]:
+  """Applies zip_longest, specialized to @parameterized's argument format.
+
+  Args:
+    targets: Iterable of parameterized test arguments.
+    sources: Iterable of parameterized test arguments. If `targets` is a mapping
+      `sources` must be a mapping as well.
+
+  Returns:
+    A list of the zipped arguments, of the type of `targets` and with each
+    zipped argument internally sorted (target, source). If either input sequence
+    was longer, the last element of the shorter input sequence is repeated.
+  """
+  return spec.zip_longest(targets, sources)
+
+
+def named_product(
+    first: Iterable[TypingSequence[Any] | Mapping[str, Any]],
+    second: Iterable[TypingSequence[Any] | Mapping[str, Any]],
+) -> Callable[[_TestFnT], _TestFnT]:
+  """Builds named parameters from the product of iterators of named parameters.
+
+  As in parameterized.named_parameters, if an iterator's items are sequences,
+  the first element is interpreted as the name. If an iterator's items are
+  mappings, the `testcase_name` key is used.
+
+  Args:
+    first: Iterable of named parameters, whose names will be the first part of
+      the named product's test names.
+    second: Iterable of named parameters, whose names will be the second part of
+      the named product's test names.
+
+  Returns:
+    A decorator that calls the test function with the cartesian product of the
+    given iterators, whose items are named parameters with names of the form
+    `{first_item_name}_{second_item_name}`. If both iterators' items are
+    mappings, the product's items are mappings; otherwise they are ordered
+    tuples.
+  """
+  return spec.named_product(first, second)
+
+
+def _mask_and_pad_to_max_length(
+    a: types.Sequence, b: types.Sequence
+) -> tuple[types.Sequence, types.Sequence]:
+  """Masks and pads two sequences to the same max length."""
+  # Only compare values in non-masked regions.
+  a = a.mask_invalid()
+  b = b.mask_invalid()
+  a_time = a.values.shape[1]
+  b_time = b.values.shape[1]
+  max_time = max(a_time, b_time)
+  a = a.pad_time(0, max_time - a_time, valid=False)
+  b = b.pad_time(0, max_time - b_time, valid=False)
+  return a, b
+
+
+class SequenceLayerTest(spec.SequenceLayerTest):
+  """Base class for MLX SequenceLayer tests."""
+
+  @property
+  @override
+  def sl(self) -> Any:  # pyrefly: ignore[bad-override]
+    return mlx_sl
+
+  @override
+  def setUp(self):
+    super().setUp()
+    # To avoid flakes, fix random seeds.
+    # MLX doesn't have a global seed, but we can set numpy seed.
+    np.random.seed(123456789)
+
+  @override
+  def random_sequence(
+      self,
+      *dims: int,
+      dtype=None,
+      random_mask: bool = False,
+      random_lengths: bool | None = None,
+      low: int | None = 0,
+      high: int | None = 10,
+      low_length: int = 0,
+      high_length: int | None = None,
+  ) -> types.Sequence:
+    if len(dims) < 2:
+      raise ValueError('dims must be at least (batch, time)')
+    batch_size = dims[0]
+    time = dims[1]
+    shape = dims[2:]
+
+    values_np = np.random.normal(size=(batch_size, time) + shape).astype(
+        np.float32
+    )
+    values = mx.array(values_np, dtype=dtype or mx.float32)
+
+    mask_np = np.ones((batch_size, time), dtype=bool)
+    mask = mx.array(mask_np, dtype=mx.bool_)
+
+    return types.Sequence(values, mask)
+
+  @override
+  def assertAllEqual(self, x, y):
+    """Asserts that two arrays are equal."""
+    x_np = np.array(x) if isinstance(x, mx.array) else x
+    y_np = np.array(y) if isinstance(y, mx.array) else y
+    np.testing.assert_array_equal(x_np, y_np)
+
+  @override
+  def assertSequencesEqual(  # pyrefly: ignore[bad-override]
+      self, x: types.Sequence, y: types.Sequence
+  ):
+    """After padding, checks sequence values are equal and masks are equal."""
+    x, y = _mask_and_pad_to_max_length(x, y)
+    self.assertAllEqual(x.values, y.values)
+    self.assertAllEqual(x.mask, y.mask)
+
+  @override
+  # pyrefly: ignore[bad-override]
+  def _step_by_step(
+      self,
+      layer: types.SequenceLayer,
+      x: types.Sequence,
+      *,
+      block_size: int = 1,
+      constants=None,
+      stream_constants: bool = False,
+      stream_constants_list: list[Any] | None = None,
+  ) -> tuple[types.Sequence, Any]:
+    batch = x.values.shape[0] if hasattr(x, 'values') else x.shape[0]
+    time = x.values.shape[1] if hasattr(x, 'values') else x.shape[1]
+
+    input_spec = types.ShapeDType(x.channel_shape, x.dtype)
+
+    init_constants = dict(constants) if constants else {}
+
+    state = layer.get_initial_state(
+        batch, input_spec, constants=init_constants or None, training=False
+    )
+
+    outputs_values = []
+    outputs_masks = []
+
+    for t in range(0, time, block_size):
+      x_block = Sequence(
+          x.values[:, t : t + block_size],
+          x.mask[:, t : t + block_size],
+      )
+
+      step_constants = dict(constants) if constants else {}
+      if stream_constants and stream_constants_list:
+        step_idx = t // block_size
+        if step_idx < len(stream_constants_list):
+          step_constants.update(stream_constants_list[step_idx])
+
+      y_block, state = layer.step(
+          x_block, state, constants=step_constants or None, training=False
+      )
+      outputs_values.append(y_block.values)
+      outputs_masks.append(y_block.mask)
+
+    y_values = mx.concatenate(outputs_values, axis=1)
+    y_mask = mx.concatenate(outputs_masks, axis=1)
+
+    return Sequence(y_values, y_mask), state
+
+  @override
+  # pyrefly: ignore[bad-override]
+  def verify_contract(
+      self,
+      l: types.SequenceLayer,
+      x: types.Sequence,
+      *,
+      training: bool = False,
+      constants=None,
+      stream_constants: bool = False,
+      stream_constants_list: list[Any] | None = None,
+      atol: float = 1e-5,
+      rtol: float = 1e-5,
+      **kwargs,
+  ) -> types.Sequence:
+    if hasattr(x, 'channel_shape'):
+      input_shape = x.channel_shape
+    elif hasattr(x, 'shape'):
+      input_shape = x.shape[2:]
+    else:
+      raise ValueError(f'Cannot determine input shape from {x}')
+    dtype = x.dtype if hasattr(x, 'dtype') else self.xp.float32
+
+    y_layer = l.layer(x, training=training, constants=constants)
+
+    expected_shape = l.get_output_shape(input_shape, constants=constants)
+    self.assertEqual(y_layer.channel_shape, expected_shape)
+
+    expected_dtype = l.get_output_dtype(dtype, constants=constants)
+    self.assertEqual(y_layer.dtype, expected_dtype)
+
+    if not l.supports_step:
+      return y_layer
+
+    block_size = l.block_size
+    y_step, _ = self._step_by_step(
+        l,
+        x,
+        block_size=block_size,
+        constants=constants,
+        stream_constants=stream_constants,
+        stream_constants_list=stream_constants_list,
+    )
+
+    self.assertEqual(y_step.shape, y_layer.shape)
+    self.assertSequencesClose(y_layer, y_step, atol=atol, rtol=rtol)
+
+    return y_layer
+
+  @override
+  def assertSequencesClose(self, x: Any, y: Any, **kwargs) -> None:
+    x_np = np.array(x.values) if hasattr(x, 'values') else np.array(x)
+    y_np = np.array(y.values) if hasattr(y, 'values') else np.array(y)
+    np.testing.assert_allclose(x_np, y_np, **kwargs)
+    if hasattr(x, 'mask') and hasattr(y, 'mask'):
+      mask_x = np.array(x.mask)
+      mask_y = np.array(y.mask)
+      np.testing.assert_array_equal(mask_x, mask_y)
+
+
+class ModuleSpecTest(SequenceLayerTest, spec.ModuleSpecTest):
+
+  @override
+  def module_spec_pairs(self, backend_sl: specs.ModuleSpec):
+    return {backend_sl.test_utils: spec.ModuleSpec}
+
+
+main = absltest.main
diff --git a/sequence_layers/mlx/test_utils_test.py b/sequence_layers/mlx/test_utils_test.py
new file mode 100644
index 0000000..045ff28
--- /dev/null
+++ b/sequence_layers/mlx/test_utils_test.py
@@ -0,0 +1,37 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for the test utilities."""
+
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import test_utils_behaviors as spec
+
+
+class ModuleSpecTest(test_utils.SequenceLayerTest, spec.ModuleSpecTest):
+  pass
+
+
+class NamedProductTest(test_utils.SequenceLayerTest, spec.NamedProductTest):
+  pass
+
+
+class ZipLongestTest(test_utils.SequenceLayerTest, spec.ZipLongestTest):
+  pass
+
+
+class VerifyContractTest(test_utils.SequenceLayerTest, spec.VerifyContractTest):
+  pass
+
+
+if __name__ == '__main__':
+  test_utils.main()
diff --git a/sequence_layers/specs/test_utils.py b/sequence_layers/specs/test_utils.py
index 119e4dd..09e9b4d 100644
--- a/sequence_layers/specs/test_utils.py
+++ b/sequence_layers/specs/test_utils.py
@@ -14,25 +14,144 @@
 """Utilities for testing sequence layers."""
 
 import abc
-from typing import Any
+import inspect
+import itertools
+from typing import Any, Callable, Iterable, Mapping
+from typing import Sequence as TypingSequence
+from typing import TypeVar
 
 from absl.testing import parameterized
 from sequence_layers import specs
 from sequence_layers.specs import backend as backend_spec
 from sequence_layers.specs import test_utils_spec
 from sequence_layers.specs import types as types_spec
+import typeguard
+
+_T = TypeVar('_T')
 
 
 class _AbcParameterizedTestCaseMeta(abc.ABCMeta, type(parameterized.TestCase)):
   """Metaclass for abstract parameterized test cases."""
 
 
+def zip_longest(
+    targets: Iterable[Iterable[Any]],
+    sources: Iterable[_T],
+) -> list[_T]:
+  """Applies zip_longest, specialized to @parameterized's argument format.
+
+  Args:
+    targets: Iterable of parameterized test arguments.
+    sources: Iterable of parameterized test arguments. If `targets` is a mapping
+      `sources` must be a mapping as well.
+
+  Returns:
+    A list of the zipped arguments, of the type of `targets` and with each
+    zipped argument internally sorted (target, source). If either input sequence
+    was longer, the last element of the shorter input sequence is repeated.
+  """
+  results: list[Any] = []
+  prev_source, prev_target = None, None
+  for source, target in itertools.zip_longest(sources, targets):
+    if source is None:
+      source = prev_source
+    if target is None:
+      target = prev_target
+
+    if isinstance(target, Mapping):
+      assert isinstance(source, Mapping)
+      results.append({**target, **source})
+    elif isinstance(target, Iterable) and not isinstance(target, (str, bytes)):
+      if isinstance(source, Mapping):
+        raise ValueError('Cannot zip mapping source with non-mapping target')
+      assert isinstance(source, Iterable)
+      results.append(tuple(target) + tuple(source))
+    else:
+      results.append((target, source))
+
+    prev_source, prev_target = source, target
+
+  return results
+
+
+_TestFnT = Callable[..., None]
+
+
+def named_product(
+    first: Iterable[TypingSequence[Any] | Mapping[str, Any]],
+    second: Iterable[TypingSequence[Any] | Mapping[str, Any]],
+) -> Callable[[_TestFnT], _TestFnT]:
+  """Builds named parameters from the product of iterators of named parameters.
+
+  As in parameterized.named_parameters, if an iterator's items are sequences,
+  the first element is interpreted as the name. If an iterator's items are
+  mappings, the `testcase_name` key is used.
+
+  Args:
+    first: Iterable of named parameters, whose names will be the first part of
+      the named product's test names.
+    second: Iterable of named parameters, whose names will be the second part of
+      the named product's test names.
+
+  Returns:
+    A decorator that calls the test function with the cartesian product of the
+    given iterators, whose items are named parameters with names of the form
+    `{first_item_name}_{second_item_name}`. If both iterators' items are
+    mappings, the product's items are mappings; otherwise they are ordered
+    tuples.
+  """
+  results: list[Any] = []
+
+  for p1, p2 in itertools.product(first, second):
+    for source, parameters in enumerate([p1, p2]):
+      if isinstance(parameters, Mapping):
+        if 'testcase_name' not in parameters:
+          raise ValueError(
+              f'Mapping {parameters} from iterable #{source+1} does not have'
+              ' key `testcase_name`.'
+          )
+      elif not parameters:
+        raise ValueError(
+            f'An sequence from iterable #{source+1} is empty; the first entry'
+            ' is expected to be a testcase name.'
+        )
+
+    if isinstance(p1, Mapping) and isinstance(p2, Mapping):
+      testcase_name = f'{p1["testcase_name"]}_{p2["testcase_name"]}'
+      p1 = {k: v for k, v in p1.items() if k != 'testcase_name'}
+      p2 = {k: v for k, v in p2.items() if k != 'testcase_name'}
+      results.append({**p1, **p2, 'testcase_name': testcase_name})
+    else:
+      if isinstance(p1, Mapping):
+        p1_name = p1['testcase_name']
+        p1 = tuple(v for k, v in p1.items() if k != 'testcase_name')
+      else:
+        p1_name = p1[0]
+        p1 = p1[1:]
+
+      if isinstance(p2, Mapping):
+        p2_name = p2['testcase_name']
+        p2 = tuple(v for k, v in p2.items() if k != 'testcase_name')
+      else:
+        p2_name = p2[0]
+        p2 = p2[1:]
+
+      testcase_name = f'{p1_name}_{p2_name}'
+      results.append((testcase_name, *p1, *p2))
+
+  return parameterized.named_parameters(*results)
+
+
 class SequenceLayerTest(
     parameterized.TestCase,
     metaclass=_AbcParameterizedTestCaseMeta,
 ):
-  """Base test class providing common sequence testing assertions."""
+  """Base test class providing common sequence testing assertions.
+
+  Binds a backend implementation to tests.
+  """
 
+  # sequence_layers.<backend> module
   sl: specs.ModuleSpec
 
   @property
@@ -54,6 +173,52 @@ def assertAllEqual(self, x: Any, y: Any) -> None:
 
   # pylint: enable=invalid-name
 
+  @abc.abstractmethod
+  def random_sequence(
+      self,
+      *dims: int,
+      dtype=None,
+      random_mask: bool = False,
+      random_lengths: bool | None = None,
+      low: int | None = 0,
+      high: int | None = 10,
+      low_length: int = 0,
+      high_length: int | None = None,
+  ) -> types_spec.Sequence:
+    """Generates a random sequence."""
+
+  @abc.abstractmethod
+  def _step_by_step(
+      self,
+      layer: types_spec.SequenceLayer,
+      x: types_spec.Sequence,
+      *,
+      block_size: int = 1,
+      constants=None,
+      stream_constants=None,
+  ) -> tuple[types_spec.Sequence, Any]:
+    """Runs a layer step by step."""
+
+  @abc.abstractmethod
+  def verify_contract(
+      self,
+      l: types_spec.SequenceLayer,
+      x: types_spec.Sequence,
+      *,
+      training: bool = False,
+      constants=None,
+      stream_constants: bool = False,
+      stream_constants_list: list[Any] | None = None,
+      atol: float = 1e-5,
+      rtol: float = 1e-5,
+      **kwargs,
+  ) -> types_spec.Sequence:
+    """Verifies that a layer satisfies the contract."""
+
+  @abc.abstractmethod
+  def assertSequencesClose(self, x: Any, y: Any, **kwargs) -> None:  # pylint: disable=invalid-name
+    """Asserts that two sequences are close."""
+
 
 class ModuleSpecTest(SequenceLayerTest):
   """Test that a backend-specific module implements the ModuleSpec protocol."""
@@ -67,6 +232,19 @@ def test_backend_specific_module_has_interface(self) -> None:
     for mod, protocol in pairs.items():
       self.assertIsInstance(mod, protocol)
 
+  def test_module_spec_with_typeguard(self) -> None:
+    pairs = self.module_spec_pairs(self.sl)
+    sig = inspect.signature(typeguard.check_type)
+    check_fn = getattr(typeguard, 'check_type')
+    for mod, protocol in pairs.items():
+      if 'argname' in sig.parameters:
+        check_fn('mod', mod, protocol)
+      else:
+        check_fn(mod, protocol)  # pylint: disable=no-value-for-parameter
+
 
+# Re-export the protocol and __all__ from the leaf module so that existing
+# imports (e.g. ``from sequence_layers.specs import test_utils``) continue
+# to resolve ``test_utils.ModuleSpec``.
 ModuleSpec = test_utils_spec.ModuleSpec
 __all__ = test_utils_spec.__all__
diff --git a/sequence_layers/specs/test_utils_behaviors.py b/sequence_layers/specs/test_utils_behaviors.py
new file mode 100644
index 0000000..b0bc755
--- /dev/null
+++ b/sequence_layers/specs/test_utils_behaviors.py
@@ -0,0 +1,347 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Abstract tests for test utilities."""
+
+# pylint: disable=abstract-method
+
+import fractions
+from typing import Any, override
+from unittest import mock
+
+from absl.testing import parameterized
+import numpy as np
+from sequence_layers import specs
+from sequence_layers.specs import test_utils as test_utils_spec
+from sequence_layers.specs import types as types_spec
+
+
+class ModuleSpecTest(test_utils_spec.ModuleSpecTest):
+
+  @override
+  def module_spec_pairs(self, backend_sl: specs.ModuleSpec):
+    return {backend_sl.test_utils: test_utils_spec.ModuleSpec}
+
+
+class NamedProductTest(test_utils_spec.SequenceLayerTest):
+  """Abstract tests for named_product."""
+
+  @parameterized.parameters(
+      {
+          'first': [('a', 'alpha'), ('b', 'beta')],
+          'second': [('1', 1), ('2', 2), ('3', 3)],
+          'expected': [
+              ('a_1', 'alpha', 1),
+              ('a_2', 'alpha', 2),
+              ('a_3', 'alpha', 3),
+              ('b_1', 'beta', 1),
+              ('b_2', 'beta', 2),
+              ('b_3', 'beta', 3),
+          ],
+      },
+      {
+          'first': [{'a': 'alpha', 'testcase_name': 'test'}],
+          'second': [('1', 1), ('2', 2)],
+          'expected': [
+              ('test_1', 'alpha', 1),
+              ('test_2', 'alpha', 2),
+          ],
+      },
+      {
+          'first': [
+              {'letter': 'a', 'testcase_name': 'alpha'},
+              {'testcase_name': 'beta', 'letter': 'b'},
+          ],
+          'second': [
+              {'testcase_name': 'one', 'number': 1},
+              {'number': 2, 'testcase_name': 'two'},
+          ],
+          'expected': [
+              {'letter': 'a', 'number': 1, 'testcase_name': 'alpha_one'},
+              {'letter': 'a', 'number': 2, 'testcase_name': 'alpha_two'},
+              {'letter': 'b', 'number': 1, 'testcase_name': 'beta_one'},
+              {'letter': 'b', 'number': 2, 'testcase_name': 'beta_two'},
+          ],
+      },
+  )
+  @mock.patch.object(parameterized, 'named_parameters', autospec=True)
+  def test_builds_named_products(self, mock_fn, first, second, expected):
+    self.sl.test_utils.named_product(first, second)
+    self.assertSequenceEqual(mock_fn.call_args.args, expected)
+
+  @parameterized.parameters(
+      {
+          'first': [{'testcase_name': 'alpha', 'letter': 'a'}, {'letter': 'b'}],
+          'second': [('1', 1), ('2', 2), ('3', 3)],
+          'iterator_without_testcase_name': 1,
+      },
+      {
+          'first': [{'testcase_name': 'alpha', 'letter': 'a'}],
+          'second': [('1', 1), ()],
+          'iterator_without_testcase_name': 2,
+      },
+  )
+  def test_raises_on_missing_testcase_names(
+      self, first, second, iterator_without_testcase_name
+  ):
+    with self.assertRaisesRegex(
+        ValueError, str(iterator_without_testcase_name)
+    ):
+      self.sl.test_utils.named_product(first, second)
+
+
+class ZipLongestTest(test_utils_spec.SequenceLayerTest):
+  """Abstract tests for zip_longest."""
+
+  @parameterized.parameters(
+      {
+          'targets': [('a',), ('b',)],
+          'sources': [(1,), (2,)],
+          'expected': [('a', 1), ('b', 2)],
+      },
+      {
+          'targets': [('a',), ('b',)],
+          'sources': [(1,)],
+          'expected': [('a', 1), ('b', 1)],
+      },
+      {
+          'targets': [('a',)],
+          'sources': [(1,), (2,)],
+          'expected': [('a', 1), ('a', 2)],
+      },
+      {
+          'targets': [{'testcase_name': 'a'}],
+          'sources': [{'val': 1}],
+          'expected': [{'testcase_name': 'a', 'val': 1}],
+      },
+  )
+  def test_zip_longest(self, targets, sources, expected):
+    results = self.sl.test_utils.zip_longest(targets, sources)
+    self.assertEqual(results, expected)
+
+
+class GenericDummyLayer(types_spec.SequenceLayer):
+  """Generic dummy layer for testing verify_contract."""
+
+  @override
+  def layer(
+      self,
+      x: types_spec.Sequence,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.Sequence:
+    return x
+
+  @override
+  def step(
+      self,
+      x: types_spec.Sequence,
+      state: types_spec.State,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> tuple[types_spec.Sequence, types_spec.State]:
+    return x, state
+
+  @override
+  def step_with_emits(
+      self,
+      x: types_spec.Sequence,
+      state: types_spec.State,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> tuple[types_spec.Sequence, types_spec.State, types_spec.Emits]:
+    y, state = self.step(x, state, constants=constants, training=training)
+    return y, state, ()
+
+  @override
+  def get_initial_state(
+      self,
+      batch_size: int,
+      input_spec: types_spec.ChannelSpec,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.State:
+    return None
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  @property
+  @override
+  def block_size(self) -> int:
+    return 1
+
+  @property
+  @override
+  def output_ratio(self) -> fractions.Fraction:
+    return fractions.Fraction(1)
+
+  @property
+  @override
+  def input_latency(self) -> int:
+    return 0
+
+  @property
+  @override
+  def output_latency(self) -> int:
+    return 0
+
+  @property
+  @override
+  def supports_step(self) -> bool:
+    return True
+
+  @override
+  def get_accumulated_input_latency(self, input_latency: int) -> int:
+    return input_latency
+
+  @override
+  def get_accumulated_output_latency(self, output_latency: int) -> int:
+    return output_latency
+
+  @override
+  def layer_with_emits(
+      self,
+      x: types_spec.Sequence,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> tuple[types_spec.Sequence, types_spec.Emits]:
+    return self.layer(x, training=training, constants=constants), ()
+
+  @override
+  def get_output_shape_for_sequence(
+      self,
+      x: types_spec.Sequence,
+      *,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.Shape:
+    return self.get_output_shape(x.channel_shape, constants=constants)
+
+  @override
+  def get_output_shape(
+      self,
+      input_shape: types_spec.ShapeLike,
+      *,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.Shape:
+    return tuple(input_shape)
+
+  @override
+  def get_output_dtype(
+      self,
+      input_dtype: types_spec.DType,
+      *,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.DType:
+    return input_dtype
+
+  @override
+  def get_output_spec(
+      self,
+      input_spec: Any,
+      *,
+      constants: types_spec.Constants | None = None,
+  ) -> Any:
+    shape = self.get_output_shape(input_spec.shape, constants=constants)
+    dtype = self.get_output_dtype(input_spec.dtype, constants=constants)
+
+    class Spec:
+      """Dummy spec class."""
+
+      def __init__(self, s, d):
+        self.shape = s
+        self.dtype = d
+
+    return Spec(shape, dtype)
+
+
+class GenericMismatchedDummyLayer(GenericDummyLayer):
+  """Dummy layer that induces a mismatch by returning zeros in layer()."""
+
+  @override
+  def layer(
+      self,
+      x: types_spec.Sequence,
+      *,
+      training: bool,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.Sequence:
+    return x.apply_values(lambda v: v * 0.0)
+
+
+class VerifyContractTest(test_utils_spec.SequenceLayerTest):
+  """Abstract tests for verify_contract."""
+
+  def get_dummy_layer(self, mismatch: bool) -> Any:
+    """Returns a dummy layer for testing."""
+    backend_sl = self.sl
+
+    if mismatch:
+
+      class BackendMismatchedDummyLayer(
+          GenericMismatchedDummyLayer, backend_sl.types.SequenceLayer
+      ):
+        """Mismatched dummy layer for backend."""
+
+      return BackendMismatchedDummyLayer()
+
+    class BackendDummyLayer(GenericDummyLayer, backend_sl.types.SequenceLayer):
+      """Dummy layer for backend."""
+
+    return BackendDummyLayer()
+
+  def test_verify_contract_catches_step_mismatch(self):
+    layer = self.get_dummy_layer(mismatch=True)
+
+    x = self.sl.Sequence(
+        self.xp.array(np.ones((2, 5, 10))),
+        self.xp.array(np.ones((2, 5), dtype=bool)),
+    )
+
+    with self.assertRaises(AssertionError):
+      self.verify_contract(layer, x, training=False)
+
+  def test_verify_contract_succeeds_when_equivalent(self):
+    layer = self.get_dummy_layer(mismatch=False)
+
+    x = self.sl.Sequence(
+        self.xp.array(np.ones((2, 5, 10))),
+        self.xp.array(np.ones((2, 5), dtype=bool)),
+    )
+
+    self.verify_contract(layer, x, training=False)
+
+  def test_verify_contract_handles_stream_constants(self):
+    layer = self.get_dummy_layer(mismatch=False)
+
+    x = self.sl.Sequence(
+        self.xp.array(np.ones((2, 5, 10))),
+        self.xp.array(np.ones((2, 5), dtype=bool)),
+    )
+    constants = {
+        'c': self.sl.Sequence(
+            self.xp.array(np.ones((2, 5, 1))),
+            self.xp.array(np.ones((2, 5), dtype=bool)),
+        )
+    }
+
+    self.verify_contract(
+        layer, x, training=False, constants=constants, stream_constants=True
+    )

From 90b9d3c94eb993e65fe367300f3c9515bf10555d Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 11:07:18 +0000
Subject: [PATCH 05/29] docs(sync): add temporary agent guidelines and staging
 path override

Co-authored-by: David Braun <davidbraun@google.com>
Co-authored-by: Kehang Han <kehanghan@google.com>

PiperPending-RevId: 924617734
PiperOrigin-RevId: 924617734
---
 pyproject.toml                           |  1 +
 sequence_layers/jax/__init__.py          |  1 +
 sequence_layers/jax/test_utils.py        |  4 +++-
 sequence_layers/jax/types.py             | 21 +++++++++++++++------
 sequence_layers/mlx/__init__.py          |  2 +-
 sequence_layers/mlx/backend.py           |  2 +-
 sequence_layers/mlx/test_utils.py        |  1 +
 sequence_layers/specs/backend.py         |  3 ++-
 sequence_layers/specs/test_utils.py      |  1 +
 sequence_layers/specs/test_utils_spec.py |  2 +-
 sequence_layers/specs/types.py           |  4 ++--
 11 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9930f9b..e40e2bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,6 +95,7 @@ disable = [
   "too-many-public-methods",
   "too-many-return-statements",
   "too-many-statements",
+  "unknown-option-value",
 ]
 
 
diff --git a/sequence_layers/jax/__init__.py b/sequence_layers/jax/__init__.py
index 3eb88cc..bdbcf7d 100644
--- a/sequence_layers/jax/__init__.py
+++ b/sequence_layers/jax/__init__.py
@@ -30,6 +30,7 @@
 
 # (re-export the names for typechecking)
 # pylint: disable=useless-import-alias
+from . import backend as backend
 from . import test_utils as test_utils
 from . import types as types
 from .test_utils import SequenceLayerTest
diff --git a/sequence_layers/jax/test_utils.py b/sequence_layers/jax/test_utils.py
index 54c58e8..eeba552 100644
--- a/sequence_layers/jax/test_utils.py
+++ b/sequence_layers/jax/test_utils.py
@@ -20,7 +20,9 @@
 import functools
 import logging
 import random
-from typing import Any, Callable, Iterable, Mapping, Sequence as TypingSequence, TypeVar, override
+from typing import Any, Callable, Iterable, Mapping, override
+from typing import Sequence as TypingSequence
+from typing import TypeVar
 
 from absl.testing import absltest
 import chex
diff --git a/sequence_layers/jax/types.py b/sequence_layers/jax/types.py
index 08f8f06..4a9edfe 100644
--- a/sequence_layers/jax/types.py
+++ b/sequence_layers/jax/types.py
@@ -28,11 +28,11 @@
     ParamSpec,
     Protocol,
     Self,
-    Sequence as TypingSequence,
-    TypeVar,
     cast,
     override,
 )
+from typing import Sequence as TypingSequence
+from typing import TypeVar
 
 from absl import logging
 from flax import linen as nn
@@ -574,7 +574,9 @@ def apply_values_masked(
   ) -> 'MaskedSequence[NewValuesT, MaskT]':
     return cast(
         MaskedSequence,
-        super().apply_values_masked(values_fn, *args, **kwargs),  # pytype: disable=wrong-arg-types
+        super().apply_values_masked(
+            values_fn, *args, **kwargs
+        ),  # pytype: disable=wrong-arg-types
     )
 
   @override
@@ -585,7 +587,10 @@ def apply_masked(
       **kwargs: ApplyMaskedParams.kwargs,
   ) -> 'MaskedSequence[NewValuesT, NewMaskT]':
     return cast(
-        MaskedSequence, super().apply_masked(apply_fn, *args, **kwargs)  # pytype: disable=wrong-arg-types
+        MaskedSequence,
+        super().apply_masked(
+            apply_fn, *args, **kwargs
+        ),  # pytype: disable=wrong-arg-types
     )
 
   @override
@@ -1342,7 +1347,9 @@ def get_output_shape(
     return tuple(input_shape)
 
 
-class Emitting(SequenceLayer, spec.Emitting[Sequence, Sequence, ChannelSpec]):  # pytype: disable=ignored-abstractmethod
+class Emitting(
+    SequenceLayer, spec.Emitting[Sequence, Sequence, ChannelSpec]
+):  # pytype: disable=ignored-abstractmethod
   """A SequenceLayer that emits auxiliary arrays.
 
   This is a convenience subclass that implements step and layer in terms of
@@ -1401,7 +1408,9 @@ def layer_with_emits(
     pass
 
 
-class Stateless(SequenceLayer, spec.Stateless[Sequence, Sequence, ChannelSpec]):  # pytype: disable=ignored-abstractmethod
+class Stateless(
+    SequenceLayer, spec.Stateless[Sequence, Sequence, ChannelSpec]
+):  # pytype: disable=ignored-abstractmethod
   """A SequenceLayer with no state over time required for step-wise processing.
 
   Sub-classes must only implement:
diff --git a/sequence_layers/mlx/__init__.py b/sequence_layers/mlx/__init__.py
index ddaa329..5bba4b2 100644
--- a/sequence_layers/mlx/__init__.py
+++ b/sequence_layers/mlx/__init__.py
@@ -1,4 +1,4 @@
-# pylint: disable=cyclic-import
+# pylint: disable=cyclic-import,g-importing-member
 # Copyright 2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/sequence_layers/mlx/backend.py b/sequence_layers/mlx/backend.py
index 513cd21..1b2ad85 100644
--- a/sequence_layers/mlx/backend.py
+++ b/sequence_layers/mlx/backend.py
@@ -37,7 +37,7 @@ def zeros(self, shape, dtype=None) -> types_spec.Array:
 
   @override
   def concatenate(self, arrays, axis=0) -> types_spec.Array:
-    return mx.concatenate(arrays, axis=axis)  # pyrefly: ignore[bad-argument-type]
+    return mx.concatenate(list(arrays), axis=axis)
 
 
 xp: spec.xp = BackendWrapper()
diff --git a/sequence_layers/mlx/test_utils.py b/sequence_layers/mlx/test_utils.py
index c35b6ad..d23bd3a 100644
--- a/sequence_layers/mlx/test_utils.py
+++ b/sequence_layers/mlx/test_utils.py
@@ -99,6 +99,7 @@ class SequenceLayerTest(spec.SequenceLayerTest):
   @property
   @override
   def sl(self) -> Any:  # pyrefly: ignore[bad-override]
+    """Returns the MLX sequence_layers module."""
     return mlx_sl
 
   @override
diff --git a/sequence_layers/specs/backend.py b/sequence_layers/specs/backend.py
index 45657bf..706b8b7 100644
--- a/sequence_layers/specs/backend.py
+++ b/sequence_layers/specs/backend.py
@@ -13,7 +13,8 @@
 # limitations under the License.
 """Specification for backend-specific helpers."""
 
-from typing import Any, Protocol, Sequence as TypingSequence, runtime_checkable
+from typing import Any, Protocol, runtime_checkable
+from typing import Sequence as TypingSequence
 
 from sequence_layers.specs import types as types_spec
 
diff --git a/sequence_layers/specs/test_utils.py b/sequence_layers/specs/test_utils.py
index 09e9b4d..d4b98c3 100644
--- a/sequence_layers/specs/test_utils.py
+++ b/sequence_layers/specs/test_utils.py
@@ -21,6 +21,7 @@
 from typing import TypeVar
 
 from absl.testing import parameterized
+
 from sequence_layers import specs
 from sequence_layers.specs import backend as backend_spec
 from sequence_layers.specs import test_utils_spec
diff --git a/sequence_layers/specs/test_utils_spec.py b/sequence_layers/specs/test_utils_spec.py
index 636062e..b1a6a32 100644
--- a/sequence_layers/specs/test_utils_spec.py
+++ b/sequence_layers/specs/test_utils_spec.py
@@ -39,7 +39,7 @@ def named_product(
     """Creates a named product."""
 
   @property
-  def SequenceLayerTest(self) -> type[Any]:  # pylint: disable=invalid-name
+  def SequenceLayerTest(self) -> type[Any]:  # pylint: disable=invalid-name,missing-function-docstring
     ...
 
 
diff --git a/sequence_layers/specs/types.py b/sequence_layers/specs/types.py
index f37279f..aec921e 100644
--- a/sequence_layers/specs/types.py
+++ b/sequence_layers/specs/types.py
@@ -31,11 +31,11 @@
     Iterable,
     Literal,
     MutableMapping,
+    override,
     Protocol,
+    runtime_checkable,
     Self,
     TypeVar,
-    override,
-    runtime_checkable,
 )
 from typing import cast  # pylint: disable=unused-import
 

From 7172b0b569443a67f313ed2e935f1a270b69acc8 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 11:07:18 +0000
Subject: [PATCH 06/29] refactor(mlx/test_utils): standardize abstract
 test_utils and backends for specs architecture

Co-authored-by: David Braun <davidbraun@google.com>
Co-authored-by: Kehang Han <kehanghan@google.com>

PiperPending-RevId: 924617736
PiperOrigin-RevId: 924617736
---
 sequence_layers/jax/backend.py           |  79 ++++++
 sequence_layers/jax/test_utils.py        |  10 +-
 sequence_layers/mlx/backend.py           |  98 +++++++
 sequence_layers/mlx/test_utils.py        | 311 ++++++++++++++++++-----
 sequence_layers/mlx/types.py             |  41 ++-
 sequence_layers/specs/backend.py         |  68 ++++-
 sequence_layers/specs/test_utils.py      |  50 ++++
 sequence_layers/specs/test_utils_spec.py |   4 +
 8 files changed, 591 insertions(+), 70 deletions(-)

diff --git a/sequence_layers/jax/backend.py b/sequence_layers/jax/backend.py
index ab694ac..37ac458 100644
--- a/sequence_layers/jax/backend.py
+++ b/sequence_layers/jax/backend.py
@@ -15,6 +15,7 @@
 
 from typing import override
 
+import jax.nn as jnn
 import jax.numpy as jnp
 from sequence_layers.specs import backend as spec
 from sequence_layers.specs import types as types_spec
@@ -35,8 +36,86 @@ def array(self, a, dtype=None) -> types_spec.Array:
   def zeros(self, shape, dtype=None) -> types_spec.Array:
     return jnp.zeros(shape, dtype=dtype)
 
+  @override
   def concatenate(self, arrays, axis=0) -> types_spec.Array:
     return jnp.concatenate(arrays, axis=axis)
 
+  @override
+  def broadcast_to(self, array, shape) -> types_spec.Array:
+    return jnp.broadcast_to(array, shape)
+
+  @override
+  def abs(self, x) -> types_spec.Array:
+    return jnp.abs(x)
+
+  @override
+  def exp(self, x) -> types_spec.Array:
+    return jnp.exp(x)
+
+  @override
+  def log(self, x) -> types_spec.Array:
+    return jnp.log(x)
+
+  @override
+  def mean(
+      self,
+      x,
+      axis=None,
+      dtype=None,
+      keepdims=False,
+      where=None,
+  ) -> types_spec.Array:
+    return jnp.mean(x, axis=axis, dtype=dtype, keepdims=keepdims, where=where)
+
+  @override
+  def var(
+      self,
+      x,
+      axis=None,
+      dtype=None,
+      keepdims=False,
+      where=None,
+  ) -> types_spec.Array:
+    return jnp.var(x, axis=axis, dtype=dtype, keepdims=keepdims, where=where)
+
 
 xp: spec.xp = BackendWrapper()
+
+
+class NNWrapper(spec.nn):
+  """Wrapper around JAX activations to match backend protocol."""
+
+  @override
+  def relu(self, x: types_spec.Array) -> types_spec.Array:
+    return jnn.relu(x)
+
+  @override
+  def sigmoid(self, x: types_spec.Array) -> types_spec.Array:
+    return jnn.sigmoid(x)
+
+  @override
+  def tanh(self, x: types_spec.Array) -> types_spec.Array:
+    return jnn.tanh(x)
+
+  @override
+  def swish(self, x: types_spec.Array) -> types_spec.Array:
+    return jnn.swish(x)
+
+  @override
+  def gelu(self, x: types_spec.Array) -> types_spec.Array:
+    return jnn.gelu(x)
+
+  @override
+  def elu(self, x: types_spec.Array) -> types_spec.Array:
+    return jnn.elu(x)
+
+  @override
+  def softplus(self, x: types_spec.Array) -> types_spec.Array:
+    return jnn.softplus(x)
+
+  @override
+  def softmax(self, x: types_spec.Array, axis: int = -1) -> types_spec.Array:
+    return jnn.softmax(x, axis=axis)
+
+
+nn: spec.nn = NNWrapper()
diff --git a/sequence_layers/jax/test_utils.py b/sequence_layers/jax/test_utils.py
index eeba552..49ec878 100644
--- a/sequence_layers/jax/test_utils.py
+++ b/sequence_layers/jax/test_utils.py
@@ -725,6 +725,10 @@ def setUp(self):
     random.seed(123456789)
     np.random.seed(123456789)
 
+  @override
+  def get_variables(self, layer):
+    return layer.variables
+
   def init_and_bind_layer(
       self,
       key: jax.Array,
@@ -774,10 +778,12 @@ def randomize_weights_fn(variables):
 
     return layer.bind(variables)
 
-  def init_layer(self, layer, x, **kwargs):
+  def init_layer(self, layer, x, bind_only=False, constants=None):
     """Initialize and bind variables for JAX."""
+    if bind_only:
+      return layer.bind({})
     key = jax.random.PRNGKey(1234)
-    return self.init_and_bind_layer(key, layer, x, **kwargs)
+    return self.init_and_bind_layer(key, layer, x, constants=constants)
 
   def verify_masked(self, x: types.Sequence):
     """Asserts all invalid timesteps in x have values masked to zero."""
diff --git a/sequence_layers/mlx/backend.py b/sequence_layers/mlx/backend.py
index 1b2ad85..69353a2 100644
--- a/sequence_layers/mlx/backend.py
+++ b/sequence_layers/mlx/backend.py
@@ -16,6 +16,7 @@
 from typing import override
 
 import mlx.core as mx
+import mlx.nn as nn_mlx
 from sequence_layers.specs import backend as spec
 from sequence_layers.specs import types as types_spec
 
@@ -39,5 +40,102 @@ def zeros(self, shape, dtype=None) -> types_spec.Array:
   def concatenate(self, arrays, axis=0) -> types_spec.Array:
     return mx.concatenate(list(arrays), axis=axis)
 
+  @override
+  def broadcast_to(self, array, shape) -> types_spec.Array:
+    return mx.broadcast_to(array, shape)
+
+  @override
+  def abs(self, x) -> types_spec.Array:
+    return mx.abs(x)
+
+  @override
+  def exp(self, x) -> types_spec.Array:
+    return mx.exp(x)
+
+  @override
+  def log(self, x) -> types_spec.Array:
+    return mx.log(x)
+
+  @override
+  def mean(
+      self,
+      x,
+      axis=None,
+      dtype=None,
+      keepdims=False,
+      where=None,
+  ) -> types_spec.Array:
+    if where is not None:
+      x_masked = mx.where(where, x, 0.0)
+      summed = mx.sum(x_masked, axis=axis, keepdims=keepdims)
+      counts = mx.sum(where.astype(mx.int32), axis=axis, keepdims=keepdims)
+      counts = mx.maximum(counts, 1)
+      result = summed / counts
+    else:
+      result = mx.mean(x, axis=axis, keepdims=keepdims)
+    if dtype is not None:
+      result = result.astype(dtype)
+    return result
+
+  @override
+  def var(
+      self,
+      x,
+      axis=None,
+      dtype=None,
+      keepdims=False,
+      where=None,
+  ) -> types_spec.Array:
+    if where is not None:
+      mean_val = self.mean(x, axis=axis, keepdims=True, where=where)
+      squared_diff = mx.square(x - mean_val)
+      result = self.mean(
+          squared_diff, axis=axis, keepdims=keepdims, where=where
+      )
+    else:
+      result = mx.var(x, axis=axis, keepdims=keepdims)
+    if dtype is not None:
+      result = result.astype(dtype)
+    return result
+
 
 xp: spec.xp = BackendWrapper()
+
+
+class NNWrapper(spec.nn):
+  """Wrapper around MLX activations to match backend protocol."""
+
+  @override
+  def relu(self, x: types_spec.Array) -> types_spec.Array:
+    return nn_mlx.relu(x)
+
+  @override
+  def sigmoid(self, x: types_spec.Array) -> types_spec.Array:
+    return mx.sigmoid(x)
+
+  @override
+  def tanh(self, x: types_spec.Array) -> types_spec.Array:
+    return mx.tanh(x)
+
+  @override
+  def swish(self, x: types_spec.Array) -> types_spec.Array:
+    return nn_mlx.silu(x)
+
+  @override
+  def gelu(self, x: types_spec.Array) -> types_spec.Array:
+    return nn_mlx.gelu(x)
+
+  @override
+  def elu(self, x: types_spec.Array) -> types_spec.Array:
+    return nn_mlx.elu(x)
+
+  @override
+  def softplus(self, x: types_spec.Array) -> types_spec.Array:
+    return nn_mlx.softplus(x)
+
+  @override
+  def softmax(self, x: types_spec.Array, axis: int = -1) -> types_spec.Array:
+    return mx.softmax(x, axis=axis)
+
+
+nn: spec.nn = NNWrapper()
diff --git a/sequence_layers/mlx/test_utils.py b/sequence_layers/mlx/test_utils.py
index d23bd3a..cf87ca9 100644
--- a/sequence_layers/mlx/test_utils.py
+++ b/sequence_layers/mlx/test_utils.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Test utilities for MLX sequence layers."""
 
+import dataclasses
 from typing import Any, Callable, Iterable, Mapping, override
 from typing import Sequence as TypingSequence
 from typing import TypeVar
@@ -52,6 +53,150 @@ def zip_longest(
   return spec.zip_longest(targets, sources)
 
 
+def random_sequence(
+    *dims: int,
+    dtype=None,
+    random_mask: bool = False,
+    random_lengths: bool | None = None,
+    low: int | None = 0,
+    high: int | None = 10,
+    low_length: int = 0,
+    high_length: int | None = None,
+) -> types.Sequence:
+  """Generates a random sequence for MLX testing."""
+  # pylint: disable=unused-argument
+  if len(dims) < 2:
+    raise ValueError('dims must be at least (batch, time)')
+  batch_size = dims[0]
+  time = dims[1]
+  shape = dims[2:]
+
+  if dtype is not None:
+    if dtype == np.float32:
+      dtype = mx.float32
+    elif dtype == np.float16:
+      dtype = mx.float16
+    elif dtype == np.int32:
+      dtype = mx.int32
+    elif dtype == np.bool_:
+      dtype = mx.bool_
+
+  if dtype is not None and dtype in (
+      mx.int32,
+      mx.int16,
+      mx.int8,
+      mx.uint32,
+      mx.uint16,
+      mx.uint8,
+  ):
+    values_np = np.random.randint(
+        low if low is not None else 0,
+        high if high is not None else 10,
+        size=(batch_size, time) + shape,
+    )
+  else:
+    values_np = np.random.normal(size=(batch_size, time) + shape).astype(
+        np.float32
+    )
+  values = mx.array(values_np, dtype=dtype or mx.float32)
+
+  mask_np = np.ones((batch_size, time), dtype=bool)
+  mask = mx.array(mask_np, dtype=mx.bool_)
+
+  return types.Sequence(values, mask)
+
+
+def step_by_step(
+    layer: types.SequenceLayer,
+    x: types.Sequence,
+    *,
+    block_size: int = 1,
+    constants=None,
+    stream_constants=None,
+    stream_constants_list: list[Any] | None = None,
+) -> tuple[types.Sequence, Any]:
+  """Applies step-by-step processing to the sequence using MLX step functions."""
+  batch = x.values.shape[0] if hasattr(x, 'values') else x.shape[0]
+  time = x.values.shape[1] if hasattr(x, 'values') else x.shape[1]
+  remainder = time % block_size
+  if remainder != 0:
+    pad_amount = block_size - remainder
+    x = x.pad_time(0, pad_amount, valid=False)
+    time = x.values.shape[1]
+
+  # Pad to multiple of block_size.
+  num_blocks = (time + block_size - 1) // block_size
+  padded_time = num_blocks * block_size
+  pad_amount = padded_time - time
+  x = x.pad_time(0, pad_amount, valid=False)
+  time = padded_time
+
+  input_spec = types.ShapeDType(x.channel_shape, x.dtype)
+
+  # Handle JAX-style stream_constants (dict of sequences or bool)
+  if isinstance(stream_constants, bool) and stream_constants:
+    stream_source = constants
+  elif isinstance(stream_constants, dict):
+    stream_source = stream_constants
+  else:
+    stream_source = None
+
+  if stream_source and stream_constants_list is None:
+    padded_stream_source = {}
+    for name, seq in stream_source.items():
+      if hasattr(seq, 'pad_time'):
+        pad_back = max(0, time - seq.values.shape[1])
+        padded_stream_source[name] = seq.pad_time(0, pad_back, valid=False)
+      else:
+        padded_stream_source[name] = seq
+    stream_source = padded_stream_source
+
+    stream_constants_list = []
+    for t in range(0, time, block_size):
+      step_dict = {}
+      for name, seq in stream_source.items():
+        if hasattr(seq, 'values') and hasattr(seq, 'mask'):
+          step_dict[name] = types.Sequence(
+              seq.values[:, t : t + block_size],
+              seq.mask[:, t : t + block_size],
+          )
+      stream_constants_list.append(step_dict)
+
+  init_constants = dict(constants) if constants else {}
+  if isinstance(stream_constants, dict):
+    init_constants.update(stream_constants)
+
+  state = layer.get_initial_state(
+      batch, input_spec, constants=init_constants or None, training=False
+  )
+
+  outputs_values = []
+  outputs_masks = []
+
+  for t in range(0, time, block_size):
+    x_block = types.Sequence(
+        x.values[:, t : t + block_size],
+        x.mask[:, t : t + block_size],
+    )
+
+    step_constants = dict(constants) if constants else {}
+    if stream_constants_list:
+      step_idx = t // block_size
+      if step_idx < len(stream_constants_list):
+        step_constants.update(stream_constants_list[step_idx])
+
+    y_block, state = layer.step(
+        x_block, state, constants=step_constants or None, training=False
+    )
+    outputs_values.append(y_block.values)
+    outputs_masks.append(y_block.mask)
+
+  y_values = mx.concatenate(outputs_values, axis=1)
+  y_mask = mx.concatenate(outputs_masks, axis=1)
+
+  return types.Sequence(y_values, y_mask), state
+
+
 def named_product(
     first: Iterable[TypingSequence[Any] | Mapping[str, Any]],
     second: Iterable[TypingSequence[Any] | Mapping[str, Any]],
@@ -109,6 +254,17 @@ def setUp(self):
     # MLX doesn't have a global seed, but we can set numpy seed.
     np.random.seed(123456789)
 
+  @override
+  def get_variables(self, layer: Any) -> dict[str, Any]:
+
+    return layer.parameters()
+
+  @override
+  def init_layer(self, layer, x, bind_only=False, constants=None):
+    if not bind_only:
+      _ = layer.layer(x, training=False, constants=constants)
+    return layer
+
   @override
   def random_sequence(
       self,
@@ -121,28 +277,38 @@ def random_sequence(
       low_length: int = 0,
       high_length: int | None = None,
   ) -> types.Sequence:
-    if len(dims) < 2:
-      raise ValueError('dims must be at least (batch, time)')
-    batch_size = dims[0]
-    time = dims[1]
-    shape = dims[2:]
-
-    values_np = np.random.normal(size=(batch_size, time) + shape).astype(
-        np.float32
+    return random_sequence(
+        *dims,
+        dtype=dtype,
+        random_mask=random_mask,
+        random_lengths=random_lengths,
+        low=low,
+        high=high,
+        low_length=low_length,
+        high_length=high_length,
     )
-    values = mx.array(values_np, dtype=dtype or mx.float32)
 
-    mask_np = np.ones((batch_size, time), dtype=bool)
-    mask = mx.array(mask_np, dtype=mx.bool_)
-
-    return types.Sequence(values, mask)
+  @override
+  def assertEqual(self, first, second, msg=None):
+    """Override to handle MLX vs NumPy dtypes."""
+    if isinstance(first, mx.Dtype) and isinstance(second, (type, np.dtype)):
+      first_str = str(first).rsplit('.', maxsplit=1)[-1]
+      second_str = np.dtype(second).name
+      if first_str == second_str:
+        return
+    super().assertEqual(first, second, msg)
 
   @override
   def assertAllEqual(self, x, y):
-    """Asserts that two arrays are equal."""
-    x_np = np.array(x) if isinstance(x, mx.array) else x
-    y_np = np.array(y) if isinstance(y, mx.array) else y
-    np.testing.assert_array_equal(x_np, y_np)
+    """Asserts that two arrays are equal (or close if float)."""
+    x_np = np.array(x) if isinstance(x, mx.array) else np.asarray(x)
+    y_np = np.array(y) if isinstance(y, mx.array) else np.asarray(y)
+    if np.issubdtype(x_np.dtype, np.floating) or np.issubdtype(
+        y_np.dtype, np.floating
+    ):
+      np.testing.assert_allclose(x_np, y_np, rtol=1e-5, atol=1e-5)
+    else:
+      np.testing.assert_array_equal(x_np, y_np)
 
   @override
   def assertSequencesEqual(  # pyrefly: ignore[bad-override]
@@ -162,52 +328,24 @@ def _step_by_step(
       *,
       block_size: int = 1,
       constants=None,
-      stream_constants: bool = False,
+      stream_constants=None,
       stream_constants_list: list[Any] | None = None,
   ) -> tuple[types.Sequence, Any]:
-    batch = x.values.shape[0] if hasattr(x, 'values') else x.shape[0]
-    time = x.values.shape[1] if hasattr(x, 'values') else x.shape[1]
-
-    input_spec = types.ShapeDType(x.channel_shape, x.dtype)
-
-    init_constants = dict(constants) if constants else {}
-
-    state = layer.get_initial_state(
-        batch, input_spec, constants=init_constants or None, training=False
+    return step_by_step(
+        layer,
+        x,
+        block_size=block_size,
+        constants=constants,
+        stream_constants=stream_constants,
+        stream_constants_list=stream_constants_list,
     )
 
-    outputs_values = []
-    outputs_masks = []
-
-    for t in range(0, time, block_size):
-      x_block = Sequence(
-          x.values[:, t : t + block_size],
-          x.mask[:, t : t + block_size],
-      )
-
-      step_constants = dict(constants) if constants else {}
-      if stream_constants and stream_constants_list:
-        step_idx = t // block_size
-        if step_idx < len(stream_constants_list):
-          step_constants.update(stream_constants_list[step_idx])
-
-      y_block, state = layer.step(
-          x_block, state, constants=step_constants or None, training=False
-      )
-      outputs_values.append(y_block.values)
-      outputs_masks.append(y_block.mask)
-
-    y_values = mx.concatenate(outputs_values, axis=1)
-    y_mask = mx.concatenate(outputs_masks, axis=1)
-
-    return Sequence(y_values, y_mask), state
-
   @override
   # pyrefly: ignore[bad-override]
   def verify_contract(
       self,
       l: types.SequenceLayer,
-      x: types.Sequence,
+      x: types.Sequence | tuple[int, ...],
       *,
       training: bool = False,
       constants=None,
@@ -217,6 +355,9 @@ def verify_contract(
       rtol: float = 1e-5,
       **kwargs,
   ) -> types.Sequence:
+    if isinstance(x, tuple):
+      x = self.random_sequence(2, 5, *x)
+
     if hasattr(x, 'channel_shape'):
       input_shape = x.channel_shape
     elif hasattr(x, 'shape'):
@@ -237,28 +378,38 @@ def verify_contract(
       return y_layer
 
     block_size = l.block_size
+    x_padded = x.pad_time(0, l.input_latency, valid=False)
     y_step, _ = self._step_by_step(
         l,
-        x,
+        x_padded,
         block_size=block_size,
         constants=constants,
         stream_constants=stream_constants,
         stream_constants_list=stream_constants_list,
     )
+    y_step = y_step[:, l.output_latency :]
 
-    self.assertEqual(y_step.shape, y_layer.shape)
     self.assertSequencesClose(y_layer, y_step, atol=atol, rtol=rtol)
 
     return y_layer
 
   @override
   def assertSequencesClose(self, x: Any, y: Any, **kwargs) -> None:
-    x_np = np.array(x.values) if hasattr(x, 'values') else np.array(x)
-    y_np = np.array(y.values) if hasattr(y, 'values') else np.array(y)
+    def _to_numpy(v):
+      if hasattr(v, 'dtype') and v.dtype == mx.bfloat16:
+        return np.array(v.astype(mx.float32))
+      return np.array(v)
+
+    if hasattr(x, 'values') and hasattr(y, 'values'):
+      x, y = _mask_and_pad_to_max_length(x, y)
+    x_np = _to_numpy(x.values) if hasattr(x, 'values') else _to_numpy(x)
+    y_np = _to_numpy(y.values) if hasattr(y, 'values') else _to_numpy(y)
+    # No float16/bfloat16 tolerance relaxation
+
     np.testing.assert_allclose(x_np, y_np, **kwargs)
     if hasattr(x, 'mask') and hasattr(y, 'mask'):
-      mask_x = np.array(x.mask)
-      mask_y = np.array(y.mask)
+      mask_x = _to_numpy(x.mask)
+      mask_y = _to_numpy(y.mask)
       np.testing.assert_array_equal(mask_x, mask_y)
 
 
@@ -269,4 +420,44 @@ def module_spec_pairs(self, backend_sl: specs.ModuleSpec):
     return {backend_sl.test_utils: spec.ModuleSpec}
 
 
+# pylint: disable=abstract-method
+# pylint: disable=abstract-class-instantiated
+class NonSteppableLayer(types.PreservesType, types.StatelessPointwise):
+  """A test layer that does not support stepping."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig):
+    """Configuration for NonSteppableLayer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'NonSteppableLayer':
+      return NonSteppableLayer(self, name=self.name)
+
+  config: Config
+
+  def __init__(self, config: Config, *, name: str | None = None):
+    # pylint: disable=unused-argument
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def supports_step(self):
+    return False
+
+  @override
+  def layer(
+      self,
+      x: types.Sequence,
+      *,
+      training: bool,
+      constants: types.Constants | None = None,
+  ) -> types.Sequence:
+    del training
+    del constants
+    return x
+
+
 main = absltest.main
diff --git a/sequence_layers/mlx/types.py b/sequence_layers/mlx/types.py
index b959725..124761a 100644
--- a/sequence_layers/mlx/types.py
+++ b/sequence_layers/mlx/types.py
@@ -56,6 +56,8 @@
 # Receptive field.
 ReceptiveField = tuple[float | int, float | int] | None
 
+InputT = TypeVar('InputT', bound='Sequence')
+OutputT = TypeVar('OutputT', bound='Sequence')
 __all__ = (
     # go/keep-sorted start
     'ChannelSpec',
@@ -69,6 +71,7 @@
     'MaskT',
     'MaskedSequence',
     'PaddingMode',
+    'PaddingModeString',
     'PreservesShape',
     'PreservesType',
     'ReceptiveField',
@@ -115,6 +118,7 @@ def __hash__(self) -> int:
 ChannelSpec = ShapeDType
 
 PaddingMode = spec.PaddingMode
+PaddingModeString = spec.PaddingModeString
 
 
 def sequence_mask(lengths: LengthsT, maxlen: int) -> mx.array:
@@ -138,7 +142,7 @@ def __init__(self, values: ValuesT, mask: MaskT):
   @override
   def shape(self) -> Shape:
     """Returns the shape of the sequence values."""
-    return self.values.shape
+    return tuple(self.values.shape)
 
   @property
   @override
@@ -150,7 +154,7 @@ def ndim(self) -> int:
   @override
   def channel_shape(self) -> Shape:
     """Returns the channel shape (the shape without batch and time)."""
-    return self.values.shape[2:]
+    return tuple(self.values.shape[2:])
 
   @property
   def channel_spec(self) -> ChannelSpec:
@@ -186,7 +190,10 @@ def from_values(cls, values: ValuesT) -> 'MaskedSequence':
     """Returns a MaskedSequence assuming every timestep is valid."""
     if values.ndim < 2:
       raise ValueError(f'Expected {values.ndim=} to be at least 2.')
-    return MaskedSequence(values, mx.ones(values.shape[:2], dtype=mx.bool_))
+    array_values = values if isinstance(values, mx.array) else mx.array(values)
+    return MaskedSequence(
+        array_values, mx.ones(array_values.shape[:2], dtype=mx.bool_)
+    )
 
   @classmethod
   @override
@@ -536,6 +543,21 @@ def output_ratio(self) -> fractions.Fraction:
   def supports_step(self) -> bool:
     return True
 
+  def get_output_shape_for_sequence(
+      self,
+      x: Sequence,
+      *,
+      constants: Constants | None = None,
+  ) -> Shape:
+    """Returns the output shape this layer produces for the provided Sequence."""
+    return self.get_output_shape(x.channel_shape, constants=constants)
+
+  @property
+  @override
+  def name(self) -> str | None:
+    """Returns the name of the layer."""
+    return self.config.name if hasattr(self, 'config') else None
+
   @property
   @override
   def input_latency(self) -> int:
@@ -786,11 +808,11 @@ def get_output_spec(
 # ---------------------------------------------------------------------------
 
 
+# pylint: disable=abstract-method
 class SequenceLayer(
     nn.Module,
     Steppable,
     spec.SequenceLayer[Sequence, Sequence, ChannelSpec],
-    metaclass=abc.ABCMeta,
 ):
   """Base Module for Sequence Layers."""
 
@@ -914,14 +936,19 @@ def step(
     return self.layer(x, training=training, constants=constants), state
 
 
+# pylint: disable=abstract-method
 class StatelessPointwise(
     PreservesShape,
     Stateless,
     spec.StatelessPointwise[Sequence, Sequence, ChannelSpec],
-    metaclass=abc.ABCMeta,
 ):
   """A SequenceLayer that has no state and operates pointwise on its input."""
 
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
 
 class StatelessPointwiseFunctor(
     StatelessPointwise,
@@ -931,12 +958,12 @@ class StatelessPointwiseFunctor(
 
   @abc.abstractmethod
   @override
-  def fn(self, values: ValuesT, mask: MaskT) -> tuple[ValuesT, MaskT]:
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
     """Transforms each scalar in values independently."""
 
   @property
   @override
-  def mask_required(self):
+  def mask_required(self) -> bool:
     """Returns true if fn can change the sequence's masked state.
 
     If fn(0) -> 0, then mask_required() is False.
diff --git a/sequence_layers/specs/backend.py b/sequence_layers/specs/backend.py
index 706b8b7..eb0d826 100644
--- a/sequence_layers/specs/backend.py
+++ b/sequence_layers/specs/backend.py
@@ -42,14 +42,80 @@ def zeros(self, shape: tuple[int, ...], dtype: Any = None) -> Array:
   def concatenate(self, arrays: TypingSequence[Array], axis: int = 0) -> Array:
     """Concatenates a list of arrays."""
 
+  def broadcast_to(self, array: Array, shape: tuple[int, ...]) -> Array:
+    """Broadcasts an array to a new shape."""
 
+  def abs(self, x: Array) -> Array:
+    """Computes absolute value."""
+
+  def exp(self, x: Array) -> Array:
+    """Computes exponential."""
+
+  def log(self, x: Array) -> Array:
+    """Computes natural logarithm."""
+
+  def mean(
+      self,
+      x: Array,
+      axis: int | tuple[int, ...] | None = None,
+      dtype: Any = None,
+      keepdims: bool = False,
+      where: Array | None = None,
+  ) -> Array:
+    """Computes the arithmetic mean along the specified axes."""
+
+  def var(
+      self,
+      x: Array,
+      axis: int | tuple[int, ...] | None = None,
+      dtype: Any = None,
+      keepdims: bool = False,
+      where: Array | None = None,
+  ) -> Array:
+    """Computes the variance along the specified axes."""
+
+
+class nn(Protocol):
+  """Protocol for neural network operations (activations)."""
+
+  def relu(self, x: Array) -> Array:
+    """Computes ReLU activation."""
+
+  def sigmoid(self, x: Array) -> Array:
+    """Computes sigmoid activation."""
+
+  def tanh(self, x: Array) -> Array:
+    """Computes tanh activation."""
+
+  def swish(self, x: Array) -> Array:
+    """Computes swish activation."""
+
+  def gelu(self, x: Array) -> Array:
+    """Computes GeLU activation."""
+
+  def elu(self, x: Array) -> Array:
+    """Computes ELU activation."""
+
+  def softplus(self, x: Array) -> Array:
+    """Computes softplus activation."""
+
+  def softmax(self, x: Array, axis: int = -1) -> Array:
+    """Computes softmax activation."""
+
+
+# pylint: disable=invalid-name
+# pylint: disable=missing-function-docstring
 @runtime_checkable
 class ModuleSpec(Protocol):
   """Specification for sequence_layers.<backend>.backend."""
 
   @property
   def xp(self) -> xp:
-    """Returns the NumPy-compatible interface."""
+    ...
+
+  @property
+  def nn(self) -> nn:
+    ...
 
 
 __all__ = [
diff --git a/sequence_layers/specs/test_utils.py b/sequence_layers/specs/test_utils.py
index d4b98c3..2410668 100644
--- a/sequence_layers/specs/test_utils.py
+++ b/sequence_layers/specs/test_utils.py
@@ -21,6 +21,7 @@
 from typing import TypeVar
 
 from absl.testing import parameterized
+import numpy as np
 
 from sequence_layers import specs
 from sequence_layers.specs import backend as backend_spec
@@ -160,6 +161,15 @@ def xp(self) -> backend_spec.xp:
     """Returns the backend wrapper."""
     return self.sl.backend.xp
 
+  @property
+  def nn(self) -> backend_spec.nn:
+    """Returns the backend nn wrapper."""
+    return self.sl.backend.nn
+
+  def make_layer(self, config: types_spec.SequenceLayerConfig) -> Any:
+    """Instantiates a layer from its config, delegating to the backend."""
+    return config.make()
+
   # pylint: disable=invalid-name
 
   @abc.abstractmethod
@@ -174,6 +184,16 @@ def assertAllEqual(self, x: Any, y: Any) -> None:
 
   # pylint: enable=invalid-name
 
+  def assertNotAllEqual(self, x: Any, y: Any) -> None:  # pylint: disable=invalid-name
+    """Asserts that not all elements are equal."""
+    x_np = np.asarray(x)
+    y_np = np.asarray(y)
+    self.assertFalse(np.all(x_np == y_np))
+
+  @abc.abstractmethod
+  def get_variables(self, layer: types_spec.SequenceLayer) -> dict[str, Any]:
+    """Returns the variables or parameters of the layer."""
+
   @abc.abstractmethod
   def random_sequence(
       self,
@@ -188,6 +208,24 @@ def random_sequence(
   ) -> types_spec.Sequence:
     """Generates a random sequence."""
 
+  @abc.abstractmethod
+  def init_layer(
+      self,
+      layer: types_spec.SequenceLayer,
+      x: types_spec.Sequence,
+      bind_only: bool = False,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.SequenceLayer:
+    """Initializes and binds a SequenceLayer for testing.
+
+    Args:
+      layer: Layer to initialize and bind.
+      x: Example input sequence to use for initialization.
+      bind_only: If True, skip initialization and only bind the layer (if
+        applicable to the backend).
+      constants: Optional constants for initialization.
+    """
+
   @abc.abstractmethod
   def _step_by_step(
       self,
@@ -220,6 +258,18 @@ def verify_contract(
   def assertSequencesClose(self, x: Any, y: Any, **kwargs) -> None:  # pylint: disable=invalid-name
     """Asserts that two sequences are close."""
 
+  def assertConfigDefaults(  # pylint: disable=invalid-name
+      self, config_cls: type[Any], expected_defaults: dict[str, Any], **kwargs
+  ) -> None:
+    """Helper to verify that a config class has the expected defaults."""
+    config = config_cls(**kwargs)
+    for field_name, expected_val in expected_defaults.items():
+      self.assertEqual(
+          getattr(config, field_name),
+          expected_val,
+          f'Default for {field_name} in {config_cls.__name__} does not match!',
+      )
+
 
 class ModuleSpecTest(SequenceLayerTest):
   """Test that a backend-specific module implements the ModuleSpec protocol."""
diff --git a/sequence_layers/specs/test_utils_spec.py b/sequence_layers/specs/test_utils_spec.py
index b1a6a32..4e3aa41 100644
--- a/sequence_layers/specs/test_utils_spec.py
+++ b/sequence_layers/specs/test_utils_spec.py
@@ -42,6 +42,10 @@ def named_product(
   def SequenceLayerTest(self) -> type[Any]:  # pylint: disable=invalid-name,missing-function-docstring
     ...
 
+  @property
+  def NonSteppableLayer(self) -> type[Any]:  # pylint: disable=invalid-name,missing-function-docstring
+    ...
+
 
 __all__ = [
     name

From 8841aae33987b28e5114b0dbec3ba100e370712c Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 11:07:18 +0000
Subject: [PATCH 07/29] fix(mlx/test_utils): standardize type idempotency and
 low-precision tolerances

Co-authored-by: David Braun <davidbraun@google.com>
Co-authored-by: Kehang Han <kehanghan@google.com>

PiperPending-RevId: 924617737
PiperOrigin-RevId: 924617737
---
 sequence_layers/mlx/__init__.py   | 53 +++++++++++++++++++++++++++----
 sequence_layers/mlx/test_utils.py | 14 ++++++--
 sequence_layers/mlx/types.py      | 33 +++++++++++++++++++
 3 files changed, 91 insertions(+), 9 deletions(-)

diff --git a/sequence_layers/mlx/__init__.py b/sequence_layers/mlx/__init__.py
index 5bba4b2..d44f0cf 100644
--- a/sequence_layers/mlx/__init__.py
+++ b/sequence_layers/mlx/__init__.py
@@ -14,11 +14,50 @@
 # limitations under the License.
 """Sequence layers in MLX."""
 
-from sequence_layers.mlx.types import *
-
-# pylint: disable=useless-import-alias
-# (re-export the names for typechecking)
-from . import backend as backend
-from . import test_utils as test_utils
-from . import types as types
+from . import backend
+from . import test_utils
+from . import types
 from .test_utils import SequenceLayerTest
+from .types import ChannelSpec
+from .types import Constants
+from .types import DType
+from .types import Emits
+from .types import Emitting
+from .types import MaskedSequence
+from .types import MaskT
+from .types import PreservesShape
+from .types import PreservesType
+from .types import Sequence
+from .types import SequenceLayer
+from .types import SequenceLayerConfig
+from .types import Shape
+from .types import ShapeDType
+from .types import ShapeLike
+from .types import State
+from .types import Stateless
+from .types import StatelessPointwise
+
+__all__ = [
+    'backend',
+    'types',
+    'test_utils',
+    'SequenceLayerTest',
+    'Constants',
+    'Sequence',
+    'MaskedSequence',
+    'SequenceLayer',
+    'SequenceLayerConfig',
+    'MaskT',
+    'Shape',
+    'ShapeDType',
+    'ShapeLike',
+    'DType',
+    'State',
+    'Emits',
+    'Emitting',
+    'ChannelSpec',
+    'Stateless',
+    'StatelessPointwise',
+    'PreservesShape',
+    'PreservesType',
+]
diff --git a/sequence_layers/mlx/test_utils.py b/sequence_layers/mlx/test_utils.py
index cf87ca9..ceceb49 100644
--- a/sequence_layers/mlx/test_utils.py
+++ b/sequence_layers/mlx/test_utils.py
@@ -374,7 +374,7 @@ def verify_contract(
     expected_dtype = l.get_output_dtype(dtype, constants=constants)
     self.assertEqual(y_layer.dtype, expected_dtype)
 
-    if not l.supports_step:
+    if not l.supports_step or not kwargs.get('test_step', True):
       return y_layer
 
     block_size = l.block_size
@@ -404,7 +404,17 @@ def _to_numpy(v):
       x, y = _mask_and_pad_to_max_length(x, y)
     x_np = _to_numpy(x.values) if hasattr(x, 'values') else _to_numpy(x)
     y_np = _to_numpy(y.values) if hasattr(y, 'values') else _to_numpy(y)
-    # No float16/bfloat16 tolerance relaxation
+    atol = kwargs.get('atol', 1e-5)
+    rtol = kwargs.get('rtol', 1e-5)
+    dtype = getattr(x, 'dtype', None)
+    if dtype == mx.float16:
+      atol = max(atol, 2e-3)
+      rtol = max(rtol, 2e-3)
+    elif dtype == mx.bfloat16:
+      atol = max(atol, 1e-2)
+      rtol = max(rtol, 1e-2)
+    kwargs['atol'] = atol
+    kwargs['rtol'] = rtol
 
     np.testing.assert_allclose(x_np, y_np, **kwargs)
     if hasattr(x, 'mask') and hasattr(y, 'mask'):
diff --git a/sequence_layers/mlx/types.py b/sequence_layers/mlx/types.py
index 124761a..c591552 100644
--- a/sequence_layers/mlx/types.py
+++ b/sequence_layers/mlx/types.py
@@ -33,6 +33,7 @@
 import jaxtyping as jt
 from mlx import nn
 import mlx.core as mx
+import numpy as np
 from sequence_layers.specs import types as spec
 
 # Type aliases.
@@ -58,6 +59,37 @@
 
 InputT = TypeVar('InputT', bound='Sequence')
 OutputT = TypeVar('OutputT', bound='Sequence')
+
+
+def _to_tuple(x: complex | list[Any]) -> complex | tuple[Any, ...]:
+  """Replaces lists in a pytree of complex with tuples."""
+  if isinstance(x, list):
+    return tuple(_to_tuple(i) for i in x)
+  return x
+
+
+@dataclasses.dataclass(frozen=True)
+class HashableArray:
+  """Hashable multidimensional array of tuples."""
+
+  data: complex | tuple[Any, ...]
+  dtype: Any
+
+  @classmethod
+  def from_array(cls, x: Any) -> 'HashableArray':
+    """Creates a HashableArray from a numpy-like array."""
+    if isinstance(x, cls):
+      return x
+    if hasattr(x, 'data') and hasattr(x, 'dtype') and hasattr(x, 'to_array'):
+      return cls(x.data, x.dtype)
+    x = np.asarray(x)
+    return cls(_to_tuple(x.tolist()), x.dtype)
+
+  def to_array(self) -> Any:
+    """Converts HashableArray back to a numpy array."""
+    return np.asarray(self.data, dtype=self.dtype)
+
+
 __all__ = (
     # go/keep-sorted start
     'ChannelSpec',
@@ -66,6 +98,7 @@
     'Emits',
     'Emitting',
     'ExpandedMaskT',
+    'HashableArray',
     'LengthsT',
     'MASK_DTYPE',
     'MaskT',

From 22b912279f97b91e2fe7667b2aad4de852f6442c Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 11:07:18 +0000
Subject: [PATCH 08/29] feat(specs/types): add channel_spec abstract property
 and concrete overrides

Co-authored-by: David Braun <davidbraun@google.com>
Co-authored-by: Kehang Han <kehanghan@google.com>

PiperPending-RevId: 924617731
PiperOrigin-RevId: 924617731
---
 sequence_layers/jax/types.py      |  1 +
 sequence_layers/mlx/test_utils.py | 13 +++--
 sequence_layers/mlx/types.py      |  1 +
 sequence_layers/specs/types.py    | 91 +++++++++++++++++++++++++------
 4 files changed, 86 insertions(+), 20 deletions(-)

diff --git a/sequence_layers/jax/types.py b/sequence_layers/jax/types.py
index 4a9edfe..ee72cf1 100644
--- a/sequence_layers/jax/types.py
+++ b/sequence_layers/jax/types.py
@@ -347,6 +347,7 @@ def channel_shape(self) -> Shape:
     return self.values.shape[2:]
 
   @property
+  @override
   def channel_spec(self) -> ChannelSpec:
     """Returns a "spec" for this sequence (the channel shape and dtype)."""
     return ChannelSpec(self.channel_shape, self.dtype)
diff --git a/sequence_layers/mlx/test_utils.py b/sequence_layers/mlx/test_utils.py
index ceceb49..6a74b5c 100644
--- a/sequence_layers/mlx/test_utils.py
+++ b/sequence_layers/mlx/test_utils.py
@@ -25,6 +25,7 @@
 import sequence_layers.mlx as mlx_sl
 from sequence_layers.mlx import types
 from sequence_layers.specs import test_utils as spec
+from sequence_layers.specs import types as specs_types
 
 Sequence = types.Sequence
 MaskedSequence = types.MaskedSequence
@@ -265,6 +266,13 @@ def init_layer(self, layer, x, bind_only=False, constants=None):
       _ = layer.layer(x, training=False, constants=constants)
     return layer
 
+  @override
+  def make_layer(self, config: specs_types.SequenceLayerConfig) -> Any:
+    """Resolves concrete MLX layer class and instantiates via from_config."""
+    from sequence_layers.mlx import utils as mlx_utils  # pylint: disable=import-outside-toplevel,g-import-not-at-top
+
+    return mlx_utils.make_layer(config)
+
   @override
   def random_sequence(
       self,
@@ -415,7 +423,6 @@ def _to_numpy(v):
       rtol = max(rtol, 1e-2)
     kwargs['atol'] = atol
     kwargs['rtol'] = rtol
-
     np.testing.assert_allclose(x_np, y_np, **kwargs)
     if hasattr(x, 'mask') and hasattr(y, 'mask'):
       mask_x = _to_numpy(x.mask)
@@ -445,12 +452,10 @@ class Config(types.SequenceLayerConfig):
     def make(self) -> 'NonSteppableLayer':
       return NonSteppableLayer(self, name=self.name)
 
-  config: Config
-
   def __init__(self, config: Config, *, name: str | None = None):
-    # pylint: disable=unused-argument
     super().__init__()
     self.config = config
+    del name
 
   @property
   @override
diff --git a/sequence_layers/mlx/types.py b/sequence_layers/mlx/types.py
index c591552..33eedaa 100644
--- a/sequence_layers/mlx/types.py
+++ b/sequence_layers/mlx/types.py
@@ -190,6 +190,7 @@ def channel_shape(self) -> Shape:
     return tuple(self.values.shape[2:])
 
   @property
+  @override
   def channel_spec(self) -> ChannelSpec:
     """Returns a "spec" for this sequence (the channel shape and dtype)."""
     return ChannelSpec(self.channel_shape, self.dtype)
diff --git a/sequence_layers/specs/types.py b/sequence_layers/specs/types.py
index aec921e..a9b0fe7 100644
--- a/sequence_layers/specs/types.py
+++ b/sequence_layers/specs/types.py
@@ -51,6 +51,21 @@
 Shape = tuple[int, ...]
 ShapeLike = list[int] | tuple[int, ...]
 DType = Any  # Can be numpy, jax, or mlx dtype
+Sharding = Any  # JAX sharding spec
+
+
+@runtime_checkable
+class HashableArray(Protocol):
+  """Protocol for hashable multidimensional arrays."""
+
+  data: complex | tuple[Any, ...]
+  """The data as a tuple or complex scalar."""
+
+  dtype: Any
+  """The dtype of the array."""
+
+  def to_array(self) -> Any:
+    """Returns the array representation."""
 
 
 class ChannelSpec(Protocol):
@@ -64,6 +79,9 @@ def shape(self) -> Shape:
   def dtype(self) -> Any:
     """The dtype of the channel."""
 
+  def __init__(self, shape: Shape, dtype: Any):
+    ...
+
 
 State = Any
 Constants = MutableMapping[str, jt.PyTree[Array]]
@@ -200,8 +218,14 @@ class PaddingMode(enum.Enum):
 ]
 
 
-class Sequence(Generic[ValuesT, MaskT], metaclass=abc.ABCMeta):
-  """Abstract base class for Sequence."""
+class Sequence[ValuesT = Array, MaskT = Array](metaclass=abc.ABCMeta):
+  """A generic sequence container that preserves masking information.
+
+  Note: This class can hold non-backend-specific arrays (like `np.ndarray`) to
+  maintain consistency with JAX. Backend implementations should handle them
+  gracefully, for example by converting to backend-native arrays just-in-time
+  when backend-specific operations require it.
+  """
 
   values: ValuesT
   mask: MaskT
@@ -224,6 +248,11 @@ def ndim(self) -> int:
   def channel_shape(self) -> Shape:
     """The shape of the channels in the sequence."""
 
+  @property
+  @abc.abstractmethod
+  def channel_spec(self) -> ChannelSpec:
+    """The channel specification of the sequence."""
+
   @property
   @abc.abstractmethod
   def dtype(self) -> DType:
@@ -362,13 +391,16 @@ def apply_masked[NewValuesT: Array, NewMaskT: Array, **P](
 class SequenceLayerConfig(metaclass=abc.ABCMeta):
   """Configuration for a SequenceLayer."""
 
+  def __init__(self, *args: Any, **kwargs: Any):
+    pass
+
   @abc.abstractmethod
   def make(self) -> Any:
     """Creates the sequence layer."""
 
-  @abc.abstractmethod
   def copy(self, **kwargs: Any) -> Self:
     """Returns a copy of the config with updated fields."""
+    return dataclasses.replace(cast(Any, self), **kwargs)
 
 
 class Steppable(Generic[InputT, OutputT, ChannelSpecT], metaclass=abc.ABCMeta):
@@ -379,6 +411,11 @@ class Steppable(Generic[InputT, OutputT, ChannelSpecT], metaclass=abc.ABCMeta):
   - step_with_emits
   """
 
+  @property
+  def name(self) -> str | None:
+    """The name of this layer."""
+    return None
+
   @property
   @abc.abstractmethod
   def block_size(self) -> int:
@@ -638,6 +675,15 @@ class SequenceLayer(
 ):
   """Base class for Sequence Layers."""
 
+  @abc.abstractmethod
+  def get_output_shape_for_sequence(
+      self,
+      x: Sequence[Any, Any],
+      *,
+      constants: Constants | None = None,
+  ) -> Shape:
+    """Returns the output shape this layer produces for the provided Sequence."""
+
 
 # ---------------------------------------------------------------------------
 # Mixins
@@ -909,6 +955,7 @@ def layer_with_emits(
     ...
 
 
+_ChannelSpecType = ChannelSpec
 _SequenceType = Sequence
 _MaskedSequenceType = MaskedSequence
 _SequenceLayerType = SequenceLayer
@@ -916,59 +963,71 @@ def layer_with_emits(
 _SteppableType = Steppable
 
 
+# pylint: disable=invalid-name
+# pylint: disable=missing-function-docstring
 @runtime_checkable
 class ModuleSpec(Protocol):
   """Specification for sequence_layers.<backend>.types."""
 
-  # pylint: disable=invalid-name
+  @property
+  def ChannelSpec(self) -> type[_ChannelSpecType]:
+    ...
+
+  @property
+  def ShapeDType(self) -> type[_ChannelSpecType]:
+    ...
+
+  @property
+  def HashableArray(self) -> type[HashableArray]:
+    ...
 
   @property
   def Sequence(self) -> type[_SequenceType[Any, Any]]:
-    """The Sequence class for this backend."""
+    ...
 
   @property
   def MaskedSequence(self) -> type[_MaskedSequenceType[Any, Any]]:
-    """The MaskedSequence class for this backend."""
+    ...
 
   @property
   def SequenceLayer(self) -> type[_SequenceLayerType]:
-    """The SequenceLayer class for this backend."""
+    ...
 
   @property
   def SequenceLayerConfig(self) -> type[_SequenceLayerConfigType]:
-    """The SequenceLayerConfig class for this backend."""
+    ...
 
   @property
   def Steppable(self) -> type[_SteppableType[Any, Any, Any]]:
-    """The Steppable class for this backend."""
+    ...
 
   @property
   def PreservesShape(self) -> type[PreservesShape]:
-    """The PreservesShape class for this backend."""
+    ...
 
   @property
   def Stateless(self) -> type[Stateless]:
-    """The Stateless class for this backend."""
+    ...
 
   @property
   def StatelessPointwise(self) -> type[StatelessPointwise]:
-    """The StatelessPointwise class for this backend."""
+    ...
 
   @property
   def StatelessPointwiseFunctor(self) -> type[StatelessPointwiseFunctor]:
-    """The StatelessPointwiseFunctor class for this backend."""
+    ...
 
   @property
   def PreservesType(self) -> type[PreservesType]:
-    """The PreservesType class for this backend."""
+    ...
 
   @property
   def Emitting(self) -> type[Emitting]:
-    """The Emitting class for this backend."""
+    ...
 
   @property
   def StatelessEmitting(self) -> type[StatelessEmitting]:
-    """The StatelessEmitting class for this backend."""
+    ...
 
 
 __all__ = (

From 9e246dc623d19abc50fadea2f5183034aae48068 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 11:07:18 +0000
Subject: [PATCH 09/29] fix(specs): resolve MRO and duplicate protocol issues
 in types

Co-authored-by: David Braun <davidbraun@google.com>
Co-authored-by: Kehang Han <kehanghan@google.com>

PiperPending-RevId: 924617735
PiperOrigin-RevId: 924617735
---
 sequence_layers/jax/convolution_test.py  |   3 +
 sequence_layers/jax/test_utils.py        |   1 -
 sequence_layers/jax/types.py             |  20 +-
 sequence_layers/mlx/utils.py             | 293 +++++++++++++++++++++++
 sequence_layers/specs/types.py           |   4 -
 sequence_layers/specs/types_behaviors.py |   9 +
 6 files changed, 319 insertions(+), 11 deletions(-)
 create mode 100644 sequence_layers/mlx/utils.py

diff --git a/sequence_layers/jax/convolution_test.py b/sequence_layers/jax/convolution_test.py
index 8035d7c..7059bde 100644
--- a/sequence_layers/jax/convolution_test.py
+++ b/sequence_layers/jax/convolution_test.py
@@ -98,6 +98,8 @@ def _expected_conv_mask(
       )[:, :, 0]
       # All timesteps where the kernel overlaps with the mask.
       return mask_golden > 0
+    case _:
+      raise ValueError(f'Unsupported padding mode: {padding}')
 
 
 class ComputeConvMaskTest(test_utils.SequenceLayerTest):
@@ -564,5 +566,6 @@ def check(mask, kernel_size, stride, dilation_rate, padding, expected):
         ]],
     )
 
+
 if __name__ == '__main__':
   test_utils.main()
diff --git a/sequence_layers/jax/test_utils.py b/sequence_layers/jax/test_utils.py
index 49ec878..8dff96c 100644
--- a/sequence_layers/jax/test_utils.py
+++ b/sequence_layers/jax/test_utils.py
@@ -35,7 +35,6 @@
 from sequence_layers.jax import utils
 from sequence_layers.specs import test_utils as spec
 
-
 _SequenceLayerT = TypeVar('_SequenceLayerT', bound=types.SequenceLayer)
 _T = TypeVar('_T')
 _TestFnT = Callable[..., None]
diff --git a/sequence_layers/jax/types.py b/sequence_layers/jax/types.py
index ee72cf1..40ededb 100644
--- a/sequence_layers/jax/types.py
+++ b/sequence_layers/jax/types.py
@@ -13,6 +13,10 @@
 # limitations under the License.
 """Basic sequence types."""
 
+# pytype: disable=override-error
+# pytype: disable=ignored-abstractmethod
+# pytype: disable=bad-return-type
+
 import abc
 import dataclasses
 import fractions
@@ -1374,7 +1378,6 @@ def step(
     )
     return output, state
 
-  @abc.abstractmethod
   def step_with_emits(
       self,
       x: Sequence,
@@ -1383,7 +1386,7 @@ def step_with_emits(
       training: bool,
       constants: Constants | None = None,
   ) -> tuple[Sequence, State, Emits]:
-    pass
+    raise NotImplementedError()
 
   @override
   def layer(
@@ -1398,7 +1401,6 @@ def layer(
     )
     return outputs
 
-  @abc.abstractmethod
   def layer_with_emits(
       self,
       x: Sequence,
@@ -1406,7 +1408,7 @@ def layer_with_emits(
       training: bool,
       constants: Constants | None = None,
   ) -> tuple[Sequence, Emits]:
-    pass
+    raise NotImplementedError()
 
 
 class Stateless(
@@ -1558,8 +1560,11 @@ class StatelessPointwiseFunctor(  # pytype: disable=ignored-abstractmethod
 
   @abc.abstractmethod
   @override
-  def fn(self, values: ValuesT, mask: MaskT) -> tuple[ValuesT, MaskT]:
+  def fn(
+      self, values: ValuesT, mask: MaskT
+  ) -> tuple[ValuesT, MaskT]:  # pytype: disable=override-error
     """Transforms each scalar in values independently."""
+    raise NotImplementedError()
 
   @property
   @override
@@ -1589,7 +1594,9 @@ def layer(
     return y
 
 
-class SequenceLayerConfig(spec.SequenceLayerConfig):
+class SequenceLayerConfig(
+    spec.SequenceLayerConfig
+):  # pytype: disable=ignored-abstractmethod
   """Base class for SequenceLayer configuration objects.
 
   Requires a no-argument make() method which returns a SequenceLayer.
@@ -1603,6 +1610,7 @@ class SequenceLayerConfig(spec.SequenceLayerConfig):
   @abc.abstractmethod
   def make(self) -> SequenceLayer:
     """Builds a SequenceLayer from this config."""
+    raise NotImplementedError()
 
   def copy(self, **kwargs) -> Self:
     """Create a copy of this config.
diff --git a/sequence_layers/mlx/utils.py b/sequence_layers/mlx/utils.py
new file mode 100644
index 0000000..c5e3693
--- /dev/null
+++ b/sequence_layers/mlx/utils.py
@@ -0,0 +1,293 @@
+"""Utility functions for MLX sequence layers."""
+
+import dataclasses
+import inspect
+from typing import Any
+
+from mlx import nn
+import mlx.core as mx
+import numpy as np
+from sequence_layers.specs import types as specs_types
+
+
+def get_output_latency(config, accumulated_output_latency=0):
+  """Returns the output latency of the provided SequenceLayerConfig.
+
+  In MLX, we can simply instantiate the layer and compute the latency
+  directly without needing JAX's eval_shape.
+
+  Args:
+    config: A SequenceLayerConfig to compute output latency for.
+    accumulated_output_latency: The accumulated output latency of preceding
+      layers. Defaults to 0.
+
+  Returns:
+    The output latency of the layer.
+  """
+  layer = config.make()
+  return _get_accumulated_output_latency(layer, accumulated_output_latency)
+
+
+def _get_accumulated_output_latency(layer, output_latency):
+  """Computes accumulated output latency for a layer.
+
+  Mirrors SequenceLayer.get_accumulated_output_latency from JAX types.
+
+  Args:
+    layer: The layer to compute latency for.
+    output_latency: The accumulated output latency of preceding layers.
+
+  Returns:
+    The accumulated output latency.
+  """
+  # Check for Serial-like combinators that chain layers.
+  if hasattr(layer, 'layers') and isinstance(layer.layers, (list, tuple)):
+    for sub in layer.layers:
+      output_latency = _get_accumulated_output_latency(sub, output_latency)
+    return output_latency
+
+  # Check for internal body (Residual stores layers in _body).
+  if hasattr(layer, '_body'):
+    return _get_accumulated_output_latency(layer.body, output_latency)
+
+  # Check for deferred layers that wrap another layer.
+  if hasattr(layer, '_layer') and layer.inner is not None:
+    return _get_accumulated_output_latency(layer.inner, output_latency)
+  if hasattr(layer, '_child'):
+    return _get_accumulated_output_latency(layer.child, output_latency)
+
+  # Single layer: compute latency.
+  output_ratio = layer.output_ratio
+  return int(output_latency * output_ratio) + layer.output_latency
+
+
+def get_required_stepwise_delay(output_ratio, input_latency):
+  """Returns the delay required so input_latency is divisible by 1/output_ratio.
+
+  When combining upsampling and downsampling layers with latency,
+  layer/step equivalence requires inserting delays. This function returns the
+  correct amount of step-wise delay to insert.
+
+  Args:
+    output_ratio: The output ratio of the layer (a fractions.Fraction).
+    input_latency: The accumulated input latency of layers preceding the layer.
+
+  Returns:
+    The amount of delay required to ensure input latency is divisible by
+    output_ratio.
+  """
+  if 1 not in output_ratio.as_integer_ratio():
+    raise NotImplementedError(
+        'get_required_stepwise_delay expects integer upsampling or'
+        f' downsampling, got {output_ratio=}'
+    )
+  return int(-input_latency % (1 / output_ratio))
+
+
+def call_layer_with_emits(
+    layer, x, *, training=False, constants=None, **kwargs
+):
+  """Calls layer_with_emits safely, handling signature mismatches in non-abstractified layers."""
+
+  sig = inspect.signature(layer.layer_with_emits)
+  call_kwargs = {}
+  if 'training' in sig.parameters:
+    call_kwargs['training'] = training
+  if 'constants' in sig.parameters:
+    call_kwargs['constants'] = constants
+  for k, v in kwargs.items():
+    if k in sig.parameters:
+      call_kwargs[k] = v
+  return layer.layer_with_emits(x, **call_kwargs)
+
+
+def call_step_with_emits(
+    layer, x, state, *, training=False, constants=None, **kwargs
+):
+  """Calls step_with_emits safely, handling signature mismatches in non-abstractified layers."""
+
+  sig = inspect.signature(layer.step_with_emits)
+  call_kwargs = {}
+  if 'training' in sig.parameters:
+    call_kwargs['training'] = training
+  if 'constants' in sig.parameters:
+    call_kwargs['constants'] = constants
+  for k, v in kwargs.items():
+    if k in sig.parameters:
+      call_kwargs[k] = v
+  return layer.step_with_emits(x, state, **call_kwargs)
+
+
+def call_get_initial_state(
+    layer, batch_size, input_spec, *, training=False, constants=None, **kwargs
+):
+  """Calls get_initial_state safely, handling signature mismatches in non-abstractified layers."""
+
+  sig = inspect.signature(layer.get_initial_state)
+  call_kwargs = {}
+  if 'training' in sig.parameters:
+    call_kwargs['training'] = training
+  if 'constants' in sig.parameters:
+    call_kwargs['constants'] = constants
+  for k, v in kwargs.items():
+    if k in sig.parameters:
+      call_kwargs[k] = v
+  return layer.get_initial_state(batch_size, input_spec, **call_kwargs)
+
+
+def _to_mx_dtype(dtype: Any) -> Any:
+  """Converts various dtype representations to MLX DType."""
+  if dtype is None:
+    return None
+  if isinstance(dtype, str):
+    if dtype == 'float32':
+      return mx.float32
+    if dtype == 'float16':
+      return mx.float16
+    if dtype == 'int32':
+      return mx.int32
+    if dtype == 'bool':
+      return mx.bool_
+  # Handle JAX/Numpy dtypes
+  try:
+    np_dtype = np.dtype(dtype)
+    if np_dtype == np.float32:
+      return mx.float32
+    if np_dtype == np.float16:
+      return mx.float16
+    if np_dtype == np.int32:
+      return mx.int32
+    if np_dtype == np.bool_:
+      return mx.bool_
+  except (TypeError, ValueError):
+    pass
+  return dtype
+
+
+def _map_activation(act: Any) -> Any:
+  """Maps an activation function or its name to the corresponding MLX activation."""
+  if act is None:
+    return None
+  if not callable(act):
+    return act
+
+  name = getattr(act, '__name__', None)
+  if name is None:
+    return act
+
+  activations = {
+      'relu': nn.relu,
+      'gelu': nn.gelu,
+      'silu': nn.silu,
+      'swish': nn.silu,
+      'sigmoid': mx.sigmoid,
+      'tanh': mx.tanh,
+      'elu': nn.elu,
+      'softmax': mx.softmax,
+      'softplus': nn.softplus,
+  }
+  return activations.get(name, act)
+
+
+# pylint: disable=too-many-nested-blocks
+def make_layer(config, backend='mlx') -> Any:
+  """Instantiates an MLX layer from a JAX or Spec config."""
+
+  # 1. Try calling config.make() if it supports backend argument.
+  if (
+      hasattr(config, 'make')
+      and type(config).make != specs_types.SequenceLayerConfig.make
+  ):
+    sig = inspect.signature(config.make)
+    if 'backend' in sig.parameters:
+      layer = config.make(backend=backend)
+      if layer is not None:
+        return layer
+    # If it's an MLX-specific config, it might have no-arg make() returning
+    # MLX layer.
+    config_module = config.__class__.__module__
+    if 'mlx' in config_module:
+      layer = config.make()
+      if layer is not None:
+        return layer
+
+  # 2. Fallback to from_config resolution.
+  config_class = config.__class__
+  parts = config_class.__qualname__.split('.')
+  if len(parts) > 1 and parts[-1] == 'Config':
+    class_name = parts[-2]
+  else:
+    class_name = config_class.__name__
+    if class_name.endswith('Config'):
+      class_name = class_name[:-6]
+
+  import sequence_layers.mlx as mlx_module  # pylint: disable=import-outside-toplevel,g-import-not-at-top
+
+  if not hasattr(mlx_module, class_name):
+    raise AttributeError(
+        f"Concrete MLX class '{class_name}' not found in sequence_layers.mlx."
+        ' Make sure it is imported and exposed in'
+        ' sequence_layers/mlx/__init__.py.'
+    )
+  mlx_class = getattr(mlx_module, class_name)
+
+  if hasattr(mlx_class, 'from_config'):
+    sig = inspect.signature(mlx_class.from_config)
+    if 'backend' in sig.parameters:
+      return mlx_class.from_config(config, backend=backend)
+    return mlx_class.from_config(config)
+
+  # 3. Dynamic conversion fallback for leaf layers without from_config.
+  if hasattr(mlx_class, 'Config') and dataclasses.is_dataclass(
+      mlx_class.Config
+  ):
+    mlx_config_class = mlx_class.Config
+    mlx_fields = {f.name: f for f in dataclasses.fields(mlx_config_class)}
+
+    kwargs = {}
+    for f in dataclasses.fields(config):
+      if f.name in mlx_fields:
+        val = getattr(config, f.name)
+
+        # Map activations and dtypes
+        if f.name == 'activation':
+          val = _map_activation(val)
+        elif 'dtype' in f.name:
+          val = _to_mx_dtype(val)
+
+        # Recursively convert nested configs
+        if isinstance(val, (list, tuple)):
+          new_val = []
+          for item in val:
+            if hasattr(item, '__class__') and dataclasses.is_dataclass(item):
+              try:
+                new_val.append(make_layer(item, backend=backend))
+              except Exception:  # pylint: disable=broad-exception-caught
+                new_val.append(item)
+            else:
+              new_val.append(item)
+          val = type(val)(new_val)
+        elif hasattr(val, '__class__') and dataclasses.is_dataclass(val):
+          try:
+            val = make_layer(val, backend=backend)
+          except Exception:  # pylint: disable=broad-exception-caught
+            pass
+
+        kwargs[f.name] = val
+
+    try:
+      mlx_config = mlx_config_class(**kwargs)
+      return mlx_config.make()
+    except Exception as e:  # pylint: disable=broad-exception-caught
+      raise AttributeError(
+          f"Concrete MLX class '{class_name}' does not implement from_config "
+          f'and dynamic instantiation failed: {e}'
+      ) from e
+
+  raise AttributeError(
+      f"Concrete MLX class '{class_name}' does not implement from_config "
+      'and has no Config dataclass for dynamic instantiation.'
+  )
+
+
+# pylint: enable=too-many-nested-blocks
diff --git a/sequence_layers/specs/types.py b/sequence_layers/specs/types.py
index a9b0fe7..d2b216f 100644
--- a/sequence_layers/specs/types.py
+++ b/sequence_layers/specs/types.py
@@ -977,10 +977,6 @@ def ChannelSpec(self) -> type[_ChannelSpecType]:
   def ShapeDType(self) -> type[_ChannelSpecType]:
     ...
 
-  @property
-  def HashableArray(self) -> type[HashableArray]:
-    ...
-
   @property
   def Sequence(self) -> type[_SequenceType[Any, Any]]:
     ...
diff --git a/sequence_layers/specs/types_behaviors.py b/sequence_layers/specs/types_behaviors.py
index b2b3a99..8bdc6ae 100644
--- a/sequence_layers/specs/types_behaviors.py
+++ b/sequence_layers/specs/types_behaviors.py
@@ -164,6 +164,15 @@ def get_output_shape(
   ) -> types_spec.Shape:
     return tuple(input_shape) + (1,)
 
+  @override
+  def get_output_shape_for_sequence(
+      self,
+      x: types_spec.Sequence,
+      *,
+      constants: types_spec.Constants | None = None,
+  ) -> types_spec.Shape:
+    return self.get_output_shape(x.channel_shape, constants=constants)
+
   @override
   def get_output_dtype(
       self,

From d95566ea2f2c2036f700d060a3e14ff84f5e7ade Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 19:20:30 +0000
Subject: [PATCH 10/29] feat: squash-merge mlx modules (simple through logging)
 onto rebased foundation

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/abstract/__init__.py          |    2 +
 sequence_layers/converters/__init__.py        |    1 +
 sequence_layers/converters/jax_to_mlx.py      |  923 +++++++
 sequence_layers/converters/jax_to_mlx_test.py | 2405 ++++++++++++++++
 sequence_layers/jax/attention/common.py       |  332 ++-
 .../jax/attention/dot_product_attention.py    |   15 +-
 .../attention/dot_product_attention_test.py   |    6 +-
 .../attention/dot_product_self_attention.py   |   15 +-
 .../dot_product_self_attention_test.py        |   77 +-
 .../local_dot_product_self_attention.py       |   12 +-
 .../local_dot_product_self_attention_test.py  |   64 +-
 .../streaming_dot_product_attention.py        |   14 +-
 .../streaming_dot_product_attention_test.py   |   67 +-
 .../streaming_local_dot_product_attention.py  |   12 +-
 sequence_layers/jax/combinators.py            |   37 +-
 sequence_layers/jax/combinators_test.py       |    5 +
 sequence_layers/jax/conditioning.py           |   86 +-
 sequence_layers/jax/conditioning_test.py      |    8 +-
 sequence_layers/jax/convolution.py            |   38 +-
 sequence_layers/jax/convolution1d_test.py     |   15 +-
 sequence_layers/jax/convolution2d_test.py     |    8 +-
 sequence_layers/jax/dense.py                  |   17 +-
 sequence_layers/jax/dense_test.py             |   81 +-
 sequence_layers/jax/dsp.py                    |   76 +-
 sequence_layers/jax/dsp_test.py               |  656 +----
 sequence_layers/jax/normalization.py          |   55 +-
 sequence_layers/jax/normalization_test.py     |  319 +--
 sequence_layers/jax/pooling.py                |   79 +-
 sequence_layers/jax/pooling_test.py           |  379 +--
 sequence_layers/jax/position.py               |   47 +-
 sequence_layers/jax/position_test.py          |  337 +--
 sequence_layers/jax/simple.py                 |  968 +++++--
 sequence_layers/jax/simple_test.py            |  670 +----
 sequence_layers/jax/utils.py                  |   51 +-
 sequence_layers/mlx/__init__.py               |  198 +-
 sequence_layers/mlx/attention.py              | 2422 +++++++++++++++++
 sequence_layers/mlx/attention_test.py         |  281 ++
 sequence_layers/mlx/combinators.py            |  712 +++++
 sequence_layers/mlx/combinators_test.py       |  284 ++
 sequence_layers/mlx/conditioning.py           |  456 ++++
 sequence_layers/mlx/conditioning_test.py      |   33 +
 sequence_layers/mlx/convolution.py            | 1150 ++++++++
 sequence_layers/mlx/convolution2d.py          | 1269 +++++++++
 sequence_layers/mlx/convolution_test.py       |  103 +
 .../mlx/decoder_transformer_test.py           |  248 ++
 sequence_layers/mlx/dense.py                  |  327 +++
 sequence_layers/mlx/dense_test.py             |   28 +
 sequence_layers/mlx/dsp.py                    | 1477 ++++++++++
 sequence_layers/mlx/dsp_test.py               |  182 ++
 sequence_layers/mlx/export.py                 |  198 ++
 sequence_layers/mlx/export_test.py            |  297 ++
 sequence_layers/mlx/init_mapping.py           |  239 ++
 sequence_layers/mlx/normalization.py          |  463 ++++
 sequence_layers/mlx/normalization_test.py     |  262 ++
 sequence_layers/mlx/pooling.py                |  696 +++++
 sequence_layers/mlx/pooling_test.py           |  110 +
 sequence_layers/mlx/position.py               |  511 ++++
 sequence_layers/mlx/position_test.py          |  142 +
 sequence_layers/mlx/projection_configs.py     |  137 +
 sequence_layers/mlx/signal.py                 |   62 +
 sequence_layers/mlx/simple.py                 | 1591 +++++++++++
 sequence_layers/mlx/simple_test.py            |  200 ++
 sequence_layers/mlx/typing.py                 |   43 +
 sequence_layers/mlx/utils.py                  |  178 +-
 sequence_layers/specs/attention.py            |  285 ++
 sequence_layers/specs/attention_behaviors.py  |  775 ++++++
 sequence_layers/specs/combinators.py          |  165 ++
 .../specs/combinators_behaviors.py            |  477 ++++
 sequence_layers/specs/conditioning.py         |  107 +
 .../specs/conditioning_behaviors.py           |  362 +++
 sequence_layers/specs/convolution.py          |  211 ++
 .../specs/convolution_behaviors.py            |  347 +++
 sequence_layers/specs/dense.py                |   61 +
 sequence_layers/specs/dense_behaviors.py      |  109 +
 sequence_layers/specs/dsp.py                  |  356 +++
 sequence_layers/specs/dsp_behaviors.py        |  428 +++
 sequence_layers/specs/normalization.py        |  136 +
 .../specs/normalization_behaviors.py          |  312 +++
 sequence_layers/specs/pooling.py              |  281 ++
 sequence_layers/specs/pooling_behaviors.py    |  859 ++++++
 sequence_layers/specs/position.py             |   85 +
 sequence_layers/specs/position_behaviors.py   |  290 ++
 sequence_layers/specs/simple.py               |  635 +++++
 sequence_layers/specs/simple_behaviors.py     |  795 ++++++
 sequence_layers/specs/types.py                |    5 +
 sequence_layers/specs/types_behaviors.py      |   56 +
 86 files changed, 26282 insertions(+), 3031 deletions(-)
 create mode 100644 sequence_layers/abstract/__init__.py
 create mode 100644 sequence_layers/converters/__init__.py
 create mode 100644 sequence_layers/converters/jax_to_mlx.py
 create mode 100644 sequence_layers/converters/jax_to_mlx_test.py
 create mode 100644 sequence_layers/mlx/attention.py
 create mode 100644 sequence_layers/mlx/attention_test.py
 create mode 100644 sequence_layers/mlx/combinators.py
 create mode 100644 sequence_layers/mlx/combinators_test.py
 create mode 100644 sequence_layers/mlx/conditioning.py
 create mode 100644 sequence_layers/mlx/conditioning_test.py
 create mode 100644 sequence_layers/mlx/convolution.py
 create mode 100644 sequence_layers/mlx/convolution2d.py
 create mode 100644 sequence_layers/mlx/convolution_test.py
 create mode 100644 sequence_layers/mlx/decoder_transformer_test.py
 create mode 100644 sequence_layers/mlx/dense.py
 create mode 100644 sequence_layers/mlx/dense_test.py
 create mode 100644 sequence_layers/mlx/dsp.py
 create mode 100644 sequence_layers/mlx/dsp_test.py
 create mode 100644 sequence_layers/mlx/export.py
 create mode 100644 sequence_layers/mlx/export_test.py
 create mode 100644 sequence_layers/mlx/init_mapping.py
 create mode 100644 sequence_layers/mlx/normalization.py
 create mode 100644 sequence_layers/mlx/normalization_test.py
 create mode 100644 sequence_layers/mlx/pooling.py
 create mode 100644 sequence_layers/mlx/pooling_test.py
 create mode 100644 sequence_layers/mlx/position.py
 create mode 100644 sequence_layers/mlx/position_test.py
 create mode 100644 sequence_layers/mlx/projection_configs.py
 create mode 100644 sequence_layers/mlx/signal.py
 create mode 100644 sequence_layers/mlx/simple.py
 create mode 100644 sequence_layers/mlx/simple_test.py
 create mode 100644 sequence_layers/mlx/typing.py
 create mode 100644 sequence_layers/specs/attention.py
 create mode 100644 sequence_layers/specs/attention_behaviors.py
 create mode 100644 sequence_layers/specs/combinators.py
 create mode 100644 sequence_layers/specs/combinators_behaviors.py
 create mode 100644 sequence_layers/specs/conditioning.py
 create mode 100644 sequence_layers/specs/conditioning_behaviors.py
 create mode 100644 sequence_layers/specs/convolution.py
 create mode 100644 sequence_layers/specs/convolution_behaviors.py
 create mode 100644 sequence_layers/specs/dense.py
 create mode 100644 sequence_layers/specs/dense_behaviors.py
 create mode 100644 sequence_layers/specs/dsp.py
 create mode 100644 sequence_layers/specs/dsp_behaviors.py
 create mode 100644 sequence_layers/specs/normalization.py
 create mode 100644 sequence_layers/specs/normalization_behaviors.py
 create mode 100644 sequence_layers/specs/pooling.py
 create mode 100644 sequence_layers/specs/pooling_behaviors.py
 create mode 100644 sequence_layers/specs/position.py
 create mode 100644 sequence_layers/specs/position_behaviors.py
 create mode 100644 sequence_layers/specs/simple.py
 create mode 100644 sequence_layers/specs/simple_behaviors.py

diff --git a/sequence_layers/abstract/__init__.py b/sequence_layers/abstract/__init__.py
new file mode 100644
index 0000000..70c0c87
--- /dev/null
+++ b/sequence_layers/abstract/__init__.py
@@ -0,0 +1,2 @@
+from sequence_layers.abstract import types
+from sequence_layers.abstract import types_test_base
diff --git a/sequence_layers/converters/__init__.py b/sequence_layers/converters/__init__.py
new file mode 100644
index 0000000..cda8403
--- /dev/null
+++ b/sequence_layers/converters/__init__.py
@@ -0,0 +1 @@
+"""Converters package."""
diff --git a/sequence_layers/converters/jax_to_mlx.py b/sequence_layers/converters/jax_to_mlx.py
new file mode 100644
index 0000000..41376e3
--- /dev/null
+++ b/sequence_layers/converters/jax_to_mlx.py
@@ -0,0 +1,923 @@
+"""Convert Linen-trained params to MLX model weights.
+
+Handles the structural differences between Linen (JAX/Flax) and MLX:
+  - Linen Dense kernel [in, out] → MLX nn.Linear weight [out, in]
+  - Linen combined QKV kernel [in, 3, heads, uph] → separate q/k/v
+  - Linen Repeat stacked params [N, ...] → per-copy params [...]
+  - Linen Partitioned wrappers → unwrapped arrays
+"""
+
+# pylint: disable=protected-access,redefined-outer-name,unused-argument,unnecessary-lambda,unbalanced-tuple-unpacking
+
+import importlib
+
+from flax import linen as flax_nn
+import jax
+import mlx.core as mx
+import mlx.nn as mlx_nn
+import numpy as np
+
+from sequence_layers.jax import attention as jax_attn
+from sequence_layers.jax import combinators as jax_comb
+from sequence_layers.jax import conditioning as jax_cond
+from sequence_layers.jax import convolution as jax_conv
+from sequence_layers.jax import dense as jax_dense
+from sequence_layers.jax import normalization as jax_norm
+from sequence_layers.jax import pooling as jax_pooling
+from sequence_layers.jax import simple as jax_simple
+from sequence_layers.jax.attention import common as attn_common
+from sequence_layers.mlx import attention as mlx_attn
+from sequence_layers.mlx import combinators as mlx_comb
+from sequence_layers.mlx import convolution2d as mlx_conv
+from sequence_layers.mlx import dense as mlx_dense
+from sequence_layers.mlx import dsp as mlx_dsp
+from sequence_layers.mlx import export as mlx_export
+from sequence_layers.mlx import normalization as mlx_norm
+from sequence_layers.mlx import projection_configs as mlx_proj
+from sequence_layers.mlx import simple as mlx_simple
+from sequence_layers.mlx import types as bt
+
+
+def _get_inner(layer):
+  """Unwrap Deferred wrapper to get the actual layer."""
+  inner = layer
+  if hasattr(inner, "inner") and inner.inner is not None:
+    inner = inner.inner
+  return inner
+
+
+def _unbox_params(params):
+  """Unwrap Flax Partitioned wrappers and convert to numpy.
+
+  Args:
+    params: A Linen param dict (possibly with Partitioned values).
+
+  Returns:
+    A nested dict of numpy arrays.
+  """
+  params = flax_nn.unbox(params)
+  return jax.tree_util.tree_map(lambda x: np.array(x), params)
+
+
+def _set_weight(module, attr_name, value):
+  """Set a weight on an MLX module.
+
+  Handles both direct array attributes and nn.Module child params.
+
+  Args:
+    module: An MLX nn.Module.
+    attr_name: Dot-separated attribute path (e.g. '_linear.weight').
+    value: An mx.array value.
+  """
+  parts = attr_name.split(".")
+  obj = module
+  for part in parts[:-1]:
+    obj = getattr(obj, part)
+  setattr(obj, parts[-1], value)
+
+
+def load_linen_params(
+    mlx_model,
+    linen_params,
+    config,
+    *,
+    input_spec=None,
+    batch_stats=None,
+    constants=None,
+    skip_materialization=False,
+):
+  """Load Linen-trained params into an MLX model.
+
+  Uses the config tree to guide the conversion, handling structural
+  differences between Linen and MLX parameter layouts.
+
+  Args:
+    mlx_model: An MLX SequenceLayer (already initialized via
+        config.make(backend='mlx')).
+    linen_params: A Linen param dict from model.init(...)['params'].
+    config: The SequenceLayerConfig used to create both models.
+    input_spec: Optional ShapeDType for the input. Defaults to scalar int32
+        (for token models). For float models (e.g. convolution), pass
+        ShapeDType((channels,), mx.float32).
+    batch_stats: Optional batch_stats dict from model.init(...)['batch_stats'].
+        Required for BatchNormalization layers.
+    constants: Optional constants dict for layers that need a source sequence
+        during deferred initialization (e.g. cross-attention).
+    skip_materialization: If True, skip materializing deferred layers.
+  """
+  if not skip_materialization:
+    if input_spec is None:
+      input_spec = bt.ShapeDType((), mx.int32)
+
+    # Materialize deferred layers with a dummy forward pass.
+    # Slice constants to time=1 to match the dummy input.
+    init_constants = None
+    if constants is not None:
+      init_constants = {}
+      for k, v in constants.items():
+        if hasattr(v, "values") and hasattr(v, "mask"):
+          # Slice Sequence to time=1.
+          init_constants[k] = bt.Sequence(v.values[:1, :1], v.mask[:1, :1])
+        else:
+          init_constants[k] = v
+    mlx_export._materialize_deferred(
+        mlx_model,
+        batch_size=1,
+        input_spec=input_spec,
+        constants=init_constants,
+    )
+
+  # Unbox and convert to numpy.
+  params = _unbox_params(linen_params)
+  bs = _unbox_params(batch_stats) if batch_stats is not None else None
+
+  # Walk the config tree and load params.
+  _load_config(mlx_model, params, config, batch_stats=bs)
+  mx.eval(mlx_model.parameters())
+
+
+def collect_all_params(module):
+  """Collect all parameters, including those in unregistered lists."""
+  params = {}
+
+  def _recurse(obj, path):
+    if isinstance(obj, mx.array):
+      params[path] = obj
+    elif isinstance(obj, dict):
+      for k, v in obj.items():
+        _recurse(v, path + (k,))
+    elif isinstance(obj, list):
+      for i, v in enumerate(obj):
+        _recurse(v, path + (str(i),))
+    elif isinstance(obj, mlx_nn.Module):
+      _recurse(obj.parameters(), path)
+
+  _recurse(module, ())
+  return params
+
+
+def check_unupdated_params(module, params_before):
+  """Check if any parameters in the module were NOT updated (same id)."""
+  params_after = collect_all_params(module)
+  unupdated = []
+  for k in params_before:
+    if k in params_after and id(params_before[k]) == id(params_after[k]):
+      unupdated.append(k)
+  return unupdated
+
+
+_MLX_CONVERTERS = {}
+
+
+def register_converter(config_cls, mapping_or_fn):
+  """Registers a converter for a given Config class."""
+  _MLX_CONVERTERS[config_cls] = mapping_or_fn
+
+
+def _load_config(mlx_module, linen_params, config, batch_stats=None):
+  """Recursively load params guided by config type."""
+  inner = mlx_module
+  inner = _get_inner(inner)
+
+  config_cls = type(config)
+  original_config_cls = config_cls
+  if ".mlx." in config_cls.__module__:
+    jax_module_name = config_cls.__module__.replace(".mlx.", ".jax.")
+    try:
+      jax_module = importlib.import_module(jax_module_name)
+      obj = jax_module
+      for part in config_cls.__qualname__.split("."):
+        obj = getattr(obj, part)
+      config_cls = obj
+    except (ImportError, AttributeError):
+      pass  # Fall back to the actual class if JAX counterpart not found
+
+  mapping_or_fn = None
+  if hasattr(config, "name") and config.name:
+    mapping_or_fn = _MLX_CONVERTERS.get(config.name)
+
+  if mapping_or_fn is None:
+    mapping_or_fn = _MLX_CONVERTERS.get(config_cls)
+
+  if mapping_or_fn is None and original_config_cls != config_cls:
+    mapping_or_fn = _MLX_CONVERTERS.get(original_config_cls)
+
+  if mapping_or_fn is None:
+    raise ValueError(f"No converter registered for config type: {config_cls}")
+
+  if callable(mapping_or_fn):
+    # Custom function fallback
+    mapping_or_fn(inner, linen_params, config, batch_stats=batch_stats)
+  else:
+    # Declarative mapping (dict)
+    for jax_name, (mlx_path, transpose) in mapping_or_fn.items():
+      if jax_name in linen_params:
+        val = mx.array(linen_params[jax_name])
+        if transpose:
+          val = val.T
+        _set_weight(inner, mlx_path, val)
+  # Stateless layers (Flatten, Identity, RoPE, pooling, etc.) have no params.
+
+
+def _load_serial(mlx_serial, linen_params, config, batch_stats=None):
+  """Load Serial: try name first, fallback to layers_{i}."""
+  print(
+      f"\n[LOAD_SERIAL] mlx_module={mlx_serial.__class__.__name__}"
+      f" (name={config.name})"
+  )
+  print(f"  linen_params keys: {list(linen_params.keys())}")
+  for i, layer_config in enumerate(config.layers):
+    name = mlx_serial._layer_names[i]
+
+    # Try semantic name first, fallback to positional key.
+    if name in linen_params:
+      key = name
+    else:
+      key = f"layers_{i}"
+
+    print(
+        f"  -> Index {i}: mlx_name={name} -> JAX key={key} (JAX params exist:"
+        f" {key in linen_params})"
+    )
+    child_params = linen_params.get(key, {})
+    child_bs = batch_stats.get(key, {}) if batch_stats else None
+
+    _load_config(
+        getattr(mlx_serial, name),
+        child_params,
+        layer_config,
+        batch_stats=child_bs,
+    )
+
+
+def _load_parallel(mlx_parallel, linen_params, config, batch_stats=None):
+  """Load Parallel: walk layer names or fallback to layers_{i}."""
+  print(
+      f"\n[LOAD_PARALLEL] mlx_module={mlx_parallel.__class__.__name__}"
+      f" (name={config.name})"
+  )
+  print(f"  linen_params keys: {list(linen_params.keys())}")
+  for i, layer_config in enumerate(config.layers):
+    name = getattr(layer_config, "name", None)
+    if name and name in linen_params:
+      key = name
+    else:
+      key = f"layers_{i}"
+    print(
+        f"  -> Index {i}: Parallel branch"
+        f" config={layer_config.__class__.__name__} config_name={name} -> JAX"
+        f" key={key} (JAX params exist: {key in linen_params})"
+    )
+    child_params = linen_params.get(key, {})
+    child_bs = batch_stats.get(key, {}) if batch_stats else None
+    _load_config(
+        mlx_parallel.layers[i],
+        child_params,
+        layer_config,
+        batch_stats=child_bs,
+    )
+
+
+def _load_repeat(mlx_repeat, linen_params, config, batch_stats=None):
+  """Load Repeat: slice stacked Linen params for each MLX copy."""
+  child_params = linen_params.get("child_layer", {})
+  child_bs = batch_stats.get("child_layer", {}) if batch_stats else None
+
+  # Linen Repeat stacks all child params with leading [num_repeats].
+  # Slice axis 0 for each copy.
+  for i in range(config.num_repeats):
+    sliced = _slice_params(child_params, i)
+    sliced_bs = _slice_params(child_bs, i) if child_bs else None
+    _load_config(
+        mlx_repeat.layers[i],
+        sliced,
+        config.layer,
+        batch_stats=sliced_bs,
+    )
+
+
+def _slice_params(params, index):
+  """Slice the leading axis of all arrays in a param dict."""
+  result = {}
+  for key, value in params.items():
+    if isinstance(value, dict):
+      result[key] = _slice_params(value, index)
+    elif isinstance(value, np.ndarray):
+      result[key] = value[index]
+    else:
+      result[key] = value
+  return result
+
+
+def _load_residual(mlx_residual, linen_params, config, batch_stats=None):
+  """Load Residual: body is Serial, shortcut is shortcut_layer."""
+  # Body is a Serial inside the Residual.
+  body = mlx_residual.body
+  for i, layer_config in enumerate(config.layers):
+    name = body._layer_names[i]
+
+    # Try semantic name first, fallback to positional key.
+    if name in linen_params:
+      key = name
+    else:
+      key = f"layers_{i}"
+
+    child_params = linen_params.get(key, {})
+    child_bs = batch_stats.get(key, {}) if batch_stats else None
+
+    _load_config(
+        getattr(body, name),
+        child_params,
+        layer_config,
+        batch_stats=child_bs,
+    )
+
+  # Shortcut (usually Identity — no params).
+  if config.shortcut_layers:
+    shortcut_params = linen_params.get("shortcut_layer", {})
+    shortcut_bs = batch_stats.get("shortcut_layer", {}) if batch_stats else None
+    if len(config.shortcut_layers) == 1:
+      shortcut_layers_mlx = [mlx_residual.shortcut]
+    else:
+      shortcut_layers_mlx = mlx_residual.shortcut.layers
+
+    for i, sc_config in enumerate(config.shortcut_layers):
+      if len(config.shortcut_layers) == 1:
+        name = mlx_residual.shortcut.name
+      else:
+        name = mlx_residual.shortcut._layer_names[i]
+
+      if name in shortcut_params:
+        sc_key = name
+      else:
+        sc_key = f"layers_{i}"
+
+      sc_bs = shortcut_bs.get(sc_key, {}) if shortcut_bs else None
+      _load_config(
+          shortcut_layers_mlx[i],
+          shortcut_params.get(sc_key, {}),
+          sc_config,
+          batch_stats=sc_bs,
+      )
+
+
+def _load_dense(mlx_dense, linen_params, config, batch_stats=None):
+  """Load Dense: transpose kernel [in, out] → [out, in]."""
+  # Handle DenseDeferred wrapper.
+  inner = mlx_dense
+  inner = _get_inner(inner)
+
+  kernel = linen_params.get("kernel")
+  if kernel is not None:
+    # Linen: [in, out], MLX nn.Linear: [out, in]
+    weight = mx.array(kernel.T)
+    inner._linear.weight = weight
+
+  bias = linen_params.get("bias")
+  if bias is not None:
+    inner._linear.bias = mx.array(bias)
+
+
+def _load_einsum_dense(mlx_einsum, linen_params, config, batch_stats=None):
+  """Load EinsumDense: kernel shape matches directly (einsum notation)."""
+  kernel = linen_params.get("kernel")
+  if kernel is not None:
+    mlx_einsum.kernel = mx.array(kernel)
+    mlx_einsum._initialized = True
+  bias = linen_params.get("bias")
+  if bias is not None:
+    mlx_einsum.bias = mx.array(bias)
+
+
+def _load_attention(mlx_attn, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load DotProductSelfAttention.
+
+  Handles:
+    - CombinedQueryKeyValueProjection:
+        query_key_value_projection/kernel [in, 3, heads, uph]
+    - SeparateQueryKeyValueProjection:
+        query_projection/kernel [in, heads, uph]
+        key_projection/kernel [in, kv_heads, uph]
+        value_projection/kernel [in, kv_heads, uph]
+  """
+
+  # Handle Deferred wrapper.
+  inner = mlx_attn
+  inner = _get_inner(inner)
+
+  input_projection = config.input_projection
+
+  if isinstance(
+      input_projection,
+      (
+          attn_common.CombinedQueryKeyValueProjection,
+          mlx_proj.CombinedQueryKeyValueProjection,
+      ),
+  ):
+    # Combined QKV: kernel [in, 3, heads, uph]
+    qkv_params = linen_params.get("query_key_value_projection", {})
+    combined_kernel = qkv_params.get("kernel")
+    if combined_kernel is not None:
+      in_features = combined_kernel.shape[0]
+      if hasattr(inner, "qkv_proj"):
+        inner.qkv_proj = mx.array(combined_kernel.reshape(in_features, -1))
+      else:
+        # Separate Q + combined KV layout.
+        q, k, v = np.split(combined_kernel, 3, axis=1)
+        inner.q_proj = mx.array(q.reshape(in_features, -1))
+        k_flat = k.reshape(in_features, -1)
+        v_flat = v.reshape(in_features, -1)
+        inner.kv_proj = mx.array(np.concatenate([k_flat, v_flat], axis=-1))
+
+    combined_bias = qkv_params.get("bias")
+    if combined_bias is not None:
+      if hasattr(inner, "qkv_bias"):
+        inner.qkv_bias = mx.array(combined_bias.reshape(-1))
+      else:
+        qb, kb, vb = np.split(combined_bias, 3, axis=0)
+        inner.q_bias = mx.array(qb.reshape(-1))
+        inner.kv_bias = mx.array(
+            np.concatenate([kb.reshape(-1), vb.reshape(-1)], axis=-1)
+        )
+
+  elif isinstance(
+      input_projection,
+      (
+          attn_common.SeparateQueryKeyValueProjection,
+          mlx_proj.SeparateQueryKeyValueProjection,
+      ),
+  ):
+    # Separate Q/K/V projections (used for GQA where num_kv_heads < num_heads).
+    q_params = linen_params.get("query_projection", {})
+    q_kernel = q_params.get("kernel")
+    if q_kernel is not None:
+      in_features = q_kernel.shape[0]
+      inner.q_proj = mx.array(q_kernel.reshape(in_features, -1))
+    q_bias = q_params.get("bias")
+    if q_bias is not None:
+      inner.q_bias = mx.array(q_bias.reshape(-1))
+
+    k_params = linen_params.get("key_projection", {})
+    k_kernel = k_params.get("kernel")
+    v_params = linen_params.get("value_projection", {})
+    v_kernel = v_params.get("kernel")
+    if k_kernel is not None and v_kernel is not None:
+      in_features = k_kernel.shape[0]
+      k_flat = k_kernel.reshape(in_features, -1)
+      v_flat = v_kernel.reshape(in_features, -1)
+      inner.kv_proj = mx.array(np.concatenate([k_flat, v_flat], axis=-1))
+    k_bias = k_params.get("bias")
+    v_bias = v_params.get("bias")
+    if k_bias is not None and v_bias is not None:
+      inner.kv_bias = mx.array(
+          np.concatenate([k_bias.reshape(-1), v_bias.reshape(-1)], axis=-1)
+      )
+
+  # per_dim_scale: learned [units_per_head] query scale.
+  per_dim_scale = linen_params.get("per_dim_scale")
+  if per_dim_scale is not None:
+    inner._per_dim_scale = mx.array(per_dim_scale)
+
+  # Attention sink embeddings.
+  sink_key = linen_params.get("sink_key_embeddings")
+  if sink_key is not None:
+    inner.sink_key_embeddings = mx.array(sink_key)
+  sink_value = linen_params.get("sink_value_embeddings")
+  if sink_value is not None:
+    inner.sink_value_embeddings = mx.array(sink_value)
+
+  # Q/K/V processing networks have no trainable params
+  # (RoPE is stateless with no learned weights).
+
+
+def _load_streaming_attention(mlx_attn, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load StreamingDotProductAttention.
+
+  Handles different projection layouts:
+    - QueryAndKeyValueProjection (default):
+        query_projection/kernel [in, heads, uph]
+        key_value_projection/kernel [source, 2, heads, uph]
+    - SeparateQueryKeyValueProjection:
+        query_projection/kernel [in, heads, uph]
+        key_projection/kernel [source, heads, uph]
+        value_projection/kernel [source, heads, uph]
+    - QueryAndSharedKeyValueProjection:
+        query_projection/kernel [in, heads, uph]
+        shared_key_value_projection/kernel [source, heads, uph]
+  """
+
+  # Handle Deferred wrapper.
+  inner = mlx_attn
+  inner = _get_inner(inner)
+
+  input_projection = config.input_projection
+
+  # Load query projection.
+  q_params = linen_params.get("query_projection", {})
+  q_kernel = q_params.get("kernel")
+  if q_kernel is not None:
+    # Shape: [in_features, num_heads, units_per_head] → [in, heads*uph]
+    in_features = q_kernel.shape[0]
+    inner.q_proj = mx.array(q_kernel.reshape(in_features, -1))
+  q_bias = q_params.get("bias")
+  if q_bias is not None:
+    inner.q_bias = mx.array(q_bias.reshape(-1))
+
+  if isinstance(
+      input_projection,
+      (
+          attn_common.QueryAndKeyValueProjection,
+          mlx_proj.QueryAndKeyValueProjection,
+      ),
+  ):
+    # Combined KV: kernel [source, 2, heads, uph] → combined kv_proj.
+    kv_params = linen_params.get("key_value_projection", {})
+    kv_kernel = kv_params.get("kernel")
+    if kv_kernel is not None:
+      source_features = kv_kernel.shape[0]
+      # Split along axis 1 (the '2' axis for K/V), flatten, recombine.
+      k, v = np.split(kv_kernel, 2, axis=1)
+      k_flat = k.reshape(source_features, -1)
+      v_flat = v.reshape(source_features, -1)
+      inner.kv_proj = mx.array(np.concatenate([k_flat, v_flat], axis=-1))
+    kv_bias = kv_params.get("bias")
+    if kv_bias is not None:
+      kb, vb = np.split(kv_bias, 2, axis=0)
+      inner.kv_bias = mx.array(
+          np.concatenate([kb.reshape(-1), vb.reshape(-1)], axis=-1)
+      )
+
+  elif isinstance(
+      input_projection,
+      (
+          attn_common.SeparateQueryKeyValueProjection,
+          mlx_proj.SeparateQueryKeyValueProjection,
+      ),
+  ):
+    # Separate K and V projections → combined kv_proj.
+    k_params = linen_params.get("key_projection", {})
+    k_kernel = k_params.get("kernel")
+    v_params = linen_params.get("value_projection", {})
+    v_kernel = v_params.get("kernel")
+    if k_kernel is not None and v_kernel is not None:
+      source_features = k_kernel.shape[0]
+      k_flat = k_kernel.reshape(source_features, -1)
+      v_flat = v_kernel.reshape(source_features, -1)
+      inner.kv_proj = mx.array(np.concatenate([k_flat, v_flat], axis=-1))
+    k_bias = k_params.get("bias")
+    v_bias = v_params.get("bias")
+    if k_bias is not None and v_bias is not None:
+      inner.kv_bias = mx.array(
+          np.concatenate([k_bias.reshape(-1), v_bias.reshape(-1)], axis=-1)
+      )
+
+  elif isinstance(
+      input_projection,
+      (
+          attn_common.QueryAndSharedKeyValueProjection,
+          mlx_proj.QueryAndSharedKeyValueProjection,
+      ),
+  ):
+    # Shared K/V projection: same weights for both K and V → combined kv_proj.
+    shared_params = linen_params.get("shared_key_value_projection", {})
+    shared_kernel = shared_params.get("kernel")
+    if shared_kernel is not None:
+      source_features = shared_kernel.shape[0]
+      proj = shared_kernel.reshape(source_features, -1)
+      inner.kv_proj = mx.array(np.concatenate([proj, proj], axis=-1))
+    shared_bias = shared_params.get("bias")
+    if shared_bias is not None:
+      b = shared_bias.reshape(-1)
+      inner.kv_bias = mx.array(np.concatenate([b, b], axis=-1))
+
+  # per_dim_scale: learned [units_per_head] query scale.
+  per_dim_scale = linen_params.get("per_dim_scale")
+  if per_dim_scale is not None:
+    inner._per_dim_scale = mx.array(per_dim_scale)
+
+  # Attention sink embeddings.
+  sink_key = linen_params.get("sink_key_embeddings")
+  if sink_key is not None:
+    inner.sink_key_embeddings = mx.array(sink_key)
+  sink_value = linen_params.get("sink_value_embeddings")
+  if sink_value is not None:
+    inner.sink_value_embeddings = mx.array(sink_value)
+
+
+def _load_rms_norm(mlx_norm, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load RMSNormalization: scale [dim] → same."""
+  scale = linen_params.get("scale")
+  if scale is not None:
+    scale_mx = mx.array(scale)
+    if mlx_norm._use_builtin and mlx_norm._rms_norm is not None:
+      mlx_norm._rms_norm.weight = scale_mx
+    elif hasattr(mlx_norm, "_scale"):
+      mlx_norm._scale = scale_mx
+
+
+def _load_layer_norm(mlx_norm, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load LayerNormalization: scale and bias."""
+  scale = linen_params.get("scale")
+  bias = linen_params.get("bias")
+
+  if mlx_norm._use_builtin and mlx_norm._layer_norm is not None:
+    if scale is not None:
+      mlx_norm._layer_norm.weight = mx.array(scale)
+    if bias is not None:
+      mlx_norm._layer_norm.bias = mx.array(bias)
+  else:
+    if scale is not None and mlx_norm._manual_scale is not None:
+      mlx_norm._manual_scale = mx.array(scale)
+    if bias is not None and mlx_norm._manual_bias is not None:
+      mlx_norm._manual_bias = mx.array(bias)
+
+
+def _load_embedding(mlx_emb, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load Embedding: table [vocab, dim] → same."""
+  embedding = linen_params.get("embedding")
+  if embedding is not None:
+    mlx_emb._embedding.weight = mx.array(embedding)
+
+
+def _load_batch_norm(mlx_bn, linen_params, config, batch_stats=None):
+  """Load BatchNormalization: scale/bias from params, mean/var from batch_stats."""
+  scale = linen_params.get("scale")
+  bias = linen_params.get("bias")
+
+  if scale is not None and mlx_bn.use_scale:
+    mlx_bn._scale = mx.array(scale)
+  if bias is not None and mlx_bn.use_bias:
+    mlx_bn._bias = mx.array(bias)
+
+  if batch_stats is not None:
+    mean = batch_stats.get("mean")
+    var = batch_stats.get("var")
+    if mean is not None:
+      mlx_bn._running_mean = mx.array(mean)
+    if var is not None:
+      mlx_bn._running_var = mx.array(var)
+
+
+def _load_group_norm(mlx_gn, linen_params, config, batch_stats=None):
+  """Load GroupNormalization: scale and bias."""
+  scale = linen_params.get("scale")
+  if scale is not None and mlx_gn.use_scale:
+    mlx_gn._scale = mx.array(scale)
+  bias = linen_params.get("bias")
+  if bias is not None and mlx_gn.use_bias:
+    mlx_gn._bias = mx.array(bias)
+
+
+def _load_conv1d(mlx_conv, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load Conv1D: kernel [k, in, out] → [out, k, in]."""
+  inner = mlx_conv
+  inner = _get_inner(inner)
+
+  kernel = linen_params.get("kernel")
+  if kernel is not None:
+    inner._conv.weight = mx.array(kernel.transpose(2, 0, 1))
+
+  bias = linen_params.get("bias")
+  if bias is not None:
+    inner._conv.bias = mx.array(bias)
+
+
+def _load_depthwise_conv1d(mlx_conv, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load DepthwiseConv1D: same kernel layout as Conv1D."""
+  _load_conv1d(mlx_conv, linen_params, config)
+
+
+def _load_conv1d_transpose(mlx_conv, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load Conv1DTranspose: kernel [k, in, out] → [out, k, in].
+
+  The kernel is flipped along the spatial axis because Linen uses
+  conv_general_dilated with lhs_dilation (correlation), while MLX uses
+  conv_transpose1d which reverses the kernel direction.
+  """
+  inner = mlx_conv
+  inner = _get_inner(inner)
+
+  kernel = linen_params.get("kernel")
+  if kernel is not None:
+    # Flip spatial axis, then transpose to MLX layout.
+    inner.kernel = mx.array(kernel[::-1].transpose(2, 0, 1))
+
+  bias = linen_params.get("bias")
+  if bias is not None:
+    inner.bias = mx.array(bias)
+
+
+def _load_conv2d(mlx_conv, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load Conv2D: kernel [kH, kW, Cin, Cout] -> [Cout, kH, kW, Cin]."""
+  inner = mlx_conv
+  inner = _get_inner(inner)
+
+  kernel = linen_params.get("kernel")
+  if kernel is not None:
+    # Reorder: [kH, kW, Cin, Cout] -> [Cout, kH, kW, Cin]
+    inner.kernel = mx.array(kernel.transpose(3, 0, 1, 2))
+
+  bias = linen_params.get("bias")
+  if bias is not None:
+    inner.bias = mx.array(bias)
+
+
+def _load_conv2d_transpose(mlx_conv, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load Conv2DTranspose: kernel [kH, kW, Cin, Cout] -> [Cout, kH, kW, Cin].
+
+  JAX's conv_transpose flips the kernel (true mathematical transposition),
+  but MLX's conv_transpose2d does NOT. We must flip the kernel along both
+  spatial dimensions (kH, kW) to compensate.
+  """
+  inner = mlx_conv
+  inner = _get_inner(inner)
+
+  kernel = linen_params.get("kernel")
+  if kernel is not None:
+    # Flip along kH and kW to match JAX's implicit kernel flip.
+    kernel = kernel[::-1, ::-1, :, :]
+    # Reorder: [kH, kW, Cin, Cout] -> [Cout, kH, kW, Cin]
+    inner.kernel = mx.array(kernel.transpose(3, 0, 1, 2))
+
+  bias = linen_params.get("bias")
+  if bias is not None:
+    inner.bias = mx.array(bias)
+
+
+def _load_conditioning(mlx_cond, linen_params, config, batch_stats=None):
+  # pylint: disable=unused-argument
+  """Load Conditioning: projection Dense kernel/bias from 'dense' subdict.
+
+  Linen Conditioning creates a DenseShaped under the name 'dense' for
+  LINEAR and LINEAR_AFFINE projections. The kernel shape matches directly
+  (input_kernel_shape + output_kernel_shape) since we use the same einsum
+  equation.
+  """
+
+  projection = config.projection
+  if projection == jax_cond.BaseConditioning.Projection.IDENTITY:
+    return  # No params for identity projection.
+
+  dense_params = linen_params.get("dense", {})
+  kernel = dense_params.get("kernel")
+  if kernel is not None:
+    mlx_cond.kernel = mx.array(kernel)
+    mlx_cond._proj_initialized = True
+  bias = dense_params.get("bias")
+  if bias is not None:
+    mlx_cond.bias = mx.array(bias)
+
+
+# === Registrations ===
+
+
+register_converter(jax_comb.Serial.Config, _load_serial)
+register_converter(mlx_comb.Serial.Config, _load_serial)
+register_converter(jax_comb.Parallel.Config, _load_parallel)
+register_converter(mlx_comb.Parallel.Config, _load_parallel)
+register_converter(jax_comb.Repeat.Config, _load_repeat)
+register_converter(jax_comb.Residual.Config, _load_residual)
+register_converter(mlx_comb.Residual.Config, _load_residual)
+
+register_converter(
+    jax_dense.Dense.Config,
+    {
+        "kernel": ("_linear.weight", True),
+        "bias": ("_linear.bias", False),
+    },
+)
+register_converter(
+    mlx_dense.Dense.Config,
+    {
+        "kernel": ("_linear.weight", True),
+        "bias": ("_linear.bias", False),
+    },
+)
+
+register_converter(jax_dense.EinsumDense.Config, _load_einsum_dense)
+register_converter(jax_norm.RMSNormalization.Config, _load_rms_norm)
+register_converter(mlx_norm.RMSNormalization.Config, _load_rms_norm)
+register_converter(jax_norm.LayerNormalization.Config, _load_layer_norm)
+register_converter(mlx_norm.LayerNormalization.Config, _load_layer_norm)
+register_converter(jax_norm.BatchNormalization.Config, _load_batch_norm)
+register_converter(jax_norm.GroupNormalization.Config, _load_group_norm)
+
+register_converter(jax_conv.Conv1D.Config, _load_conv1d)
+register_converter(jax_conv.DepthwiseConv1D.Config, _load_depthwise_conv1d)
+register_converter(jax_conv.Conv1DTranspose.Config, _load_conv1d_transpose)
+register_converter(jax_conv.Conv2D.Config, _load_conv2d)
+register_converter(mlx_conv.Conv2D.Config, _load_conv2d)
+register_converter(jax_conv.Conv2DTranspose.Config, _load_conv2d_transpose)
+register_converter(mlx_conv.Conv2DTranspose.Config, _load_conv2d_transpose)
+
+register_converter(
+    jax_attn.DotProductAttention.Config, _load_streaming_attention
+)
+register_converter(
+    jax_attn.StreamingDotProductAttention.Config,
+    _load_streaming_attention,
+)
+register_converter(
+    jax_attn.StreamingLocalDotProductAttention.Config,
+    _load_streaming_attention,
+)
+register_converter(
+    jax_attn.LocalDotProductSelfAttention.Config, _load_attention
+)
+register_converter(jax_attn.DotProductSelfAttention.Config, _load_attention)
+
+register_converter(jax_cond.Conditioning.Config, _load_conditioning)
+register_converter(
+    jax_simple.Embedding.Config,
+    {
+        "embedding": ("_embedding.weight", False),
+    },
+)
+register_converter(
+    mlx_simple.Embedding.Config,
+    {
+        "embedding": ("_embedding.weight", False),
+    },
+)
+
+
+register_converter(
+    mlx_attn.LocalDotProductSelfAttention.Config, _load_attention
+)
+register_converter(
+    mlx_attn.StreamingDotProductAttention.Config,
+    _load_streaming_attention,
+)
+register_converter(mlx_dense.EinsumDense.Config, _load_einsum_dense)
+register_converter(mlx_norm.GroupNormalization.Config, _load_group_norm)
+
+
+# Stateless layers (no params to load).
+for config_cls in [
+    jax_simple.Identity.Config,
+    jax_simple.Logging.Config,
+    jax_simple.Relu.Config,
+    jax_simple.Gelu.Config,
+    jax_simple.Abs.Config,
+    jax_simple.Exp.Config,
+    jax_simple.Log.Config,
+    jax_simple.Swish.Config,
+    jax_simple.Tanh.Config,
+    jax_simple.Sigmoid.Config,
+    jax_simple.LeakyRelu.Config,
+    jax_simple.Elu.Config,
+    jax_simple.Softmax.Config,
+    jax_simple.Softplus.Config,
+    jax_simple.Cast.Config,
+    jax_simple.Flatten.Config,
+    jax_simple.Reshape.Config,
+    jax_simple.ExpandDims.Config,
+    jax_simple.Squeeze.Config,
+    jax_simple.Transpose.Config,
+    jax_simple.OneHot.Config,
+    jax_simple.Lambda.Config,
+    jax_simple.Upsample2D.Config,
+    jax_pooling.MaxPooling1D.Config,
+    jax_pooling.MinPooling1D.Config,
+    jax_pooling.AveragePooling1D.Config,
+    jax_pooling.AveragePooling2D.Config,
+    # MLX-native stateless layers.
+    mlx_simple.Identity.Config,
+    mlx_simple.Logging.Config,
+    mlx_simple.Scale.Config,
+    mlx_simple.Add.Config,
+    mlx_simple.Relu.Config,
+    mlx_simple.Gelu.Config,
+    mlx_simple.Swish.Config,
+    mlx_simple.Elu.Config,
+    mlx_simple.Flatten.Config,
+    mlx_simple.Reshape.Config,
+    mlx_simple.ExpandDims.Config,
+    mlx_simple.GatedUnit.Config,
+    mlx_simple.GatedLinearUnit.Config,
+    mlx_simple.GatedTanhUnit.Config,
+    mlx_simple.Lambda.Config,
+    mlx_conv.Upsample2D.Config,
+    mlx_conv.AveragePooling2D.Config,
+]:
+  register_converter(config_cls, {})
+
+# MLX-native layers with special handling.
+
+
+for config_cls in [
+    mlx_simple.CheckpointName.Config,
+    mlx_simple.Dropout.Config,
+    mlx_dsp.Delay.Config,
+]:
+  register_converter(config_cls, {})
diff --git a/sequence_layers/converters/jax_to_mlx_test.py b/sequence_layers/converters/jax_to_mlx_test.py
new file mode 100644
index 0000000..f449702
--- /dev/null
+++ b/sequence_layers/converters/jax_to_mlx_test.py
@@ -0,0 +1,2405 @@
+# pyrefly: ignore-errors
+# pylint: disable=protected-access,redefined-outer-name,no-member,unused-argument,missing-function-docstring,import-outside-toplevel,no-else-return,broad-exception-caught,unused-import,unnecessary-lambda
+
+"""Cross-backend numerical tests: JAX (Linen) vs MLX.
+
+Verifies that both backends produce numerically identical outputs for all
+ported layer types when initialised from the same random Linen parameters.
+"""
+
+import dataclasses
+import importlib
+import unittest
+
+from absl.testing import absltest
+from absl.testing import parameterized
+import jax
+import jax.numpy as jnp
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.converters import jax_to_mlx as weight_converter
+from sequence_layers.jax import types as jax_types
+import sequence_layers.jax as sl
+from sequence_layers.jax.attention import common as attn_common
+from sequence_layers.mlx import export
+from sequence_layers.mlx import types as bt
+
+Sequence = bt.Sequence
+ShapeDType = bt.ShapeDType
+
+
+def _make_mlx_config(config):
+  """Convert a JAX config to an MLX config."""
+  cls = config.__class__
+  qual_name = cls.__qualname__
+  module_name = cls.__module__
+
+  if '.' in qual_name:
+    parts = qual_name.split('.')
+    if len(parts) == 2 and parts[1] == 'Config':
+      outer_class_name = parts[0]
+      mlx_module_name = module_name.replace('.jax', '.mlx')
+      # Special case for attention: JAX uses a directory, MLX uses a file.
+      if '.attention.' in mlx_module_name:
+        mlx_module_name = 'sequence_layers.mlx.attention'
+      try:
+        mlx_module = importlib.import_module(mlx_module_name)
+        outer_cls = getattr(mlx_module, outer_class_name)
+        mlx_config_cls = getattr(outer_cls, 'Config')
+
+        # Get valid fields for MLX config
+        mlx_fields = {f.name for f in dataclasses.fields(mlx_config_cls)}
+
+        # Only copy fields that exist in MLX config
+        fields = {
+            f.name: getattr(config, f.name)
+            for f in dataclasses.fields(config)
+            if f.name in mlx_fields
+        }
+
+        def _convert(obj):
+          if (
+              dataclasses.is_dataclass(obj)
+              and obj.__class__.__name__ == 'Config'
+          ):
+            return _make_mlx_config(obj)
+          elif isinstance(obj, list):
+            return [_convert(x) for x in obj]
+          elif isinstance(obj, tuple):
+            return tuple(_convert(x) for x in obj)
+          elif isinstance(obj, dict):
+            return {k: _convert(v) for k, v in obj.items()}
+          return obj
+
+        converted_fields = {k: _convert(v) for k, v in fields.items()}
+        return mlx_config_cls(**converted_fields)
+      except (ImportError, AttributeError):
+        pass
+
+  raise NotImplementedError(f'Cannot convert {qual_name} to MLX')
+
+
+def _make_mlx_model(config):
+  """Create an MLX model from a JAX config."""
+  from sequence_layers.mlx import utils as mlx_utils
+
+  try:
+    return mlx_utils.make_layer(config)
+  except Exception:
+    mlx_config = _make_mlx_config(config)
+    return mlx_config.make()
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _compare_stateless_float(
+    test_case,
+    config,
+    input_shape,
+    *,
+    batch_size=2,
+    time=8,
+    atol=1e-5,
+    rtol=1e-5,
+    seed=42,
+):
+  """Compare a stateless layer that requires no parameters (float inputs)."""
+  rng = np.random.RandomState(seed)
+  values = rng.randn(batch_size, time, *input_shape).astype(np.float32)
+  mask = np.ones((batch_size, time), dtype=bool)
+
+  # JAX.
+  jax_model = config.make()
+  x_jax = jax_types.Sequence(
+      jnp.array(values), jnp.array(mask, dtype=jnp.bool_)
+  )
+  variables = jax_model.init(jax.random.PRNGKey(0), x_jax, training=False)
+  jax_out = np.array(jax_model.apply(variables, x_jax, training=False).values)
+
+  # MLX.
+  mlx_model = _make_mlx_model(config)
+  x_mx = Sequence(mx.array(values), mx.array(mask, dtype=mx.bool_))
+  mlx_out = np.array(mlx_model.layer(x_mx, training=False).values)
+
+  np.testing.assert_allclose(
+      mlx_out,
+      jax_out,
+      atol=atol,
+      rtol=rtol,
+      err_msg=f'{config.__class__.__qualname__}: outputs differ',
+  )
+
+
+def _compare_parametric_float(
+    test_case,
+    config,
+    input_shape,
+    *,
+    batch_size=2,
+    time=8,
+    atol=1e-5,
+    rtol=1e-5,
+    seed=42,
+):
+  """Compare a parametric layer with float inputs (Conv, Dense, Norm, etc.)."""
+  rng = np.random.RandomState(seed)
+  values = rng.randn(batch_size, time, *input_shape).astype(np.float32)
+  mask = np.ones((batch_size, time), dtype=bool)
+
+  # JAX: init + run.
+  jax_model = config.make()
+  x_jax = jax_types.Sequence(
+      jnp.array(values), jnp.array(mask, dtype=jnp.bool_)
+  )
+  variables = jax_model.init(jax.random.PRNGKey(0), x_jax, training=False)
+  jax_params = variables['params']
+  jax_out = np.array(
+      jax_model.apply({'params': jax_params}, x_jax, training=False).values
+  )
+
+  # MLX: create, load weights, run.
+  mlx_model = _make_mlx_model(config)
+  weight_converter.load_linen_params(
+      mlx_model,
+      jax_params,
+      config,
+      input_spec=ShapeDType(input_shape, mx.float32),
+  )
+  x_mx = Sequence(mx.array(values), mx.array(mask, dtype=mx.bool_))
+  mlx_out = np.array(mlx_model.layer(x_mx, training=False).values)
+
+  np.testing.assert_allclose(
+      mlx_out,
+      jax_out,
+      atol=atol,
+      rtol=rtol,
+      err_msg=f'{config.__class__.__qualname__}: outputs differ',
+  )
+
+
+def _compare_parametric_int(
+    test_case,
+    config,
+    *,
+    batch_size=2,
+    time=8,
+    atol=1e-5,
+    rtol=1e-5,
+    seed=42,
+):
+  """Compare a parametric layer with integer token inputs (Embedding)."""
+  rng = np.random.RandomState(seed)
+  # Infer vocab size from config.
+  vocab = getattr(config, 'num_embeddings', 32)
+  tokens = rng.randint(0, vocab, size=(batch_size, time)).astype(np.int32)
+  mask = np.ones((batch_size, time), dtype=bool)
+
+  # JAX.
+  jax_model = config.make()
+  x_jax = jax_types.Sequence(
+      jnp.array(tokens), jnp.array(mask, dtype=jnp.bool_)
+  )
+  variables = jax_model.init(jax.random.PRNGKey(0), x_jax, training=False)
+  jax_params = variables['params']
+  jax_out = np.array(
+      jax_model.apply({'params': jax_params}, x_jax, training=False).values
+  )
+
+  # MLX.
+  mlx_model = _make_mlx_model(config)
+  weight_converter.load_linen_params(mlx_model, jax_params, config)
+  x_mx = Sequence(
+      mx.array(tokens, dtype=mx.int32),
+      mx.array(mask, dtype=mx.bool_),
+  )
+  mlx_out = np.array(mlx_model.layer(x_mx, training=False).values)
+
+  np.testing.assert_allclose(
+      mlx_out,
+      jax_out,
+      atol=atol,
+      rtol=rtol,
+      err_msg=f'{config.__class__.__qualname__}: outputs differ',
+  )
+
+
+def _compare_with_constants(
+    test_case,
+    config,
+    input_shape,
+    constants_fn,
+    *,
+    batch_size=2,
+    time=8,
+    atol=1e-4,
+    rtol=1e-4,
+    seed=42,
+):
+  """Compare a parametric layer that needs constants (cross-attention)."""
+  rng = np.random.RandomState(seed)
+  values = rng.randn(batch_size, time, *input_shape).astype(np.float32)
+  mask = np.ones((batch_size, time), dtype=bool)
+
+  jax_constants, mlx_constants = constants_fn(batch_size, time, rng)
+
+  # JAX.
+  jax_model = config.make()
+  x_jax = jax_types.Sequence(
+      jnp.array(values), jnp.array(mask, dtype=jnp.bool_)
+  )
+  variables = jax_model.init(
+      jax.random.PRNGKey(0),
+      x_jax,
+      training=False,
+      constants=jax_constants,
+  )
+  jax_params = variables['params']
+  jax_out = np.array(
+      jax_model.apply(
+          {'params': jax_params},
+          x_jax,
+          training=False,
+          constants=jax_constants,
+      ).values
+  )
+
+  # MLX.
+  mlx_model = _make_mlx_model(config)
+  weight_converter.load_linen_params(
+      mlx_model,
+      jax_params,
+      config,
+      input_spec=ShapeDType(input_shape, mx.float32),
+      constants=mlx_constants,
+  )
+  x_mx = Sequence(mx.array(values), mx.array(mask, dtype=mx.bool_))
+  mlx_out = np.array(
+      mlx_model.layer(x_mx, training=False, constants=mlx_constants).values
+  )
+
+  np.testing.assert_allclose(
+      mlx_out,
+      jax_out,
+      atol=atol,
+      rtol=rtol,
+      err_msg=f'{config.__class__.__qualname__}: outputs differ',
+  )
+
+
+# ---------------------------------------------------------------------------
+# Test Classes
+# ---------------------------------------------------------------------------
+
+
+class StatelessActivationsTest(parameterized.TestCase):
+  """Stateless activations: JAX vs MLX."""
+
+  @parameterized.named_parameters(
+      ('relu', sl.Relu.Config()),
+      ('gelu', sl.Gelu.Config(approximate=False)),
+      ('swish', sl.Swish.Config()),
+      ('tanh', sl.Tanh.Config()),
+      ('sigmoid', sl.Sigmoid.Config()),
+      ('leaky_relu', sl.LeakyRelu.Config()),
+      ('elu', sl.Elu.Config()),
+      ('softmax', sl.Softmax.Config()),
+      ('softplus', sl.Softplus.Config()),
+  )
+  def test_activation(self, config):
+    _compare_stateless_float(self, config, (16,))
+
+
+class StatelessShapeOpsTest(parameterized.TestCase):
+  """Stateless shape operations: JAX vs MLX."""
+
+  @parameterized.named_parameters(
+      ('flatten_2d', sl.Flatten.Config(), (4, 3)),
+      ('reshape', sl.Reshape.Config(output_shape=(2, 4)), (8,)),
+      ('expand_dims', sl.ExpandDims.Config(axis=-1), (8,)),
+      ('squeeze', sl.Squeeze.Config(), (8, 1)),
+      ('transpose', sl.Transpose.Config(), (4, 3)),
+  )
+  def test_shape_op(self, config, input_shape):
+    _compare_stateless_float(self, config, input_shape)
+
+
+class StatelessMiscTest(parameterized.TestCase):
+  """Stateless misc layers: JAX vs MLX."""
+
+  @parameterized.named_parameters(
+      ('scale', sl.Scale.Config(scale=0.5), (8,)),
+      ('add', sl.Add.Config(shift=1.0), (8,)),
+      ('gated_linear_unit', sl.GatedLinearUnit.Config(), (16,)),
+      ('gated_tanh_unit', sl.GatedTanhUnit.Config(), (16,)),
+  )
+  def test_misc(self, config, input_shape):
+    _compare_stateless_float(self, config, input_shape)
+
+  def test_one_hot(self):
+    config = sl.OneHot.Config(depth=8)
+    rng = np.random.RandomState(42)
+    tokens = rng.randint(0, 8, size=(2, 8)).astype(np.int32)
+    mask = np.ones((2, 8), dtype=bool)
+
+    # JAX.
+    jax_model = config.make()
+    x_jax = jax_types.Sequence(
+        jnp.array(tokens), jnp.array(mask, dtype=jnp.bool_)
+    )
+    variables = jax_model.init(jax.random.PRNGKey(0), x_jax, training=False)
+    jax_out = np.array(jax_model.apply(variables, x_jax, training=False).values)
+
+    # MLX.
+    mlx_model = _make_mlx_model(config)
+    x_mx = Sequence(
+        mx.array(tokens, dtype=mx.int32),
+        mx.array(mask, dtype=mx.bool_),
+    )
+    mlx_out = np.array(mlx_model.layer(x_mx, training=False).values)
+
+    np.testing.assert_allclose(
+        mlx_out,
+        jax_out,
+        atol=1e-5,
+        rtol=1e-5,
+        err_msg='OneHot outputs differ',
+    )
+
+
+class SamplingTest(parameterized.TestCase):
+  """Downsample1D / Upsample1D: JAX vs MLX."""
+
+  @parameterized.named_parameters(
+      ('downsample_2', sl.Downsample1D.Config(rate=2), (8,)),
+      ('downsample_3', sl.Downsample1D.Config(rate=3), (8,)),
+      ('upsample_2', sl.Upsample1D.Config(rate=2), (8,)),
+      ('upsample_3', sl.Upsample1D.Config(rate=3), (8,)),
+      ('downsample_4', sl.Downsample1D.Config(rate=4), (16,)),
+  )
+  def test_sampling(self, config, input_shape):
+    _compare_stateless_float(self, config, input_shape, time=12)
+
+
+class PoolingCrossBackendTest(parameterized.TestCase):
+  """Pooling layers: JAX vs MLX."""
+
+  @parameterized.named_parameters(
+      (
+          'max_pool_2_valid',
+          sl.MaxPooling1D.Config(pool_size=2, padding='valid'),
+          (8,),
+      ),
+      (
+          'max_pool_3_causal',
+          sl.MaxPooling1D.Config(pool_size=3, padding='causal'),
+          (8,),
+      ),
+      (
+          'min_pool_2_valid',
+          sl.MinPooling1D.Config(pool_size=2, padding='valid'),
+          (8,),
+      ),
+      (
+          'min_pool_3_causal',
+          sl.MinPooling1D.Config(pool_size=3, padding='causal'),
+          (8,),
+      ),
+      (
+          'avg_pool_2_valid',
+          sl.AveragePooling1D.Config(pool_size=2, padding='valid'),
+          (8,),
+      ),
+      (
+          'avg_pool_3_causal',
+          sl.AveragePooling1D.Config(pool_size=3, padding='causal'),
+          (8,),
+      ),
+      (
+          'max_pool_stride2',
+          sl.MaxPooling1D.Config(pool_size=2, strides=2, padding='valid'),
+          (8,),
+      ),
+      (
+          'avg_pool_masked',
+          sl.AveragePooling1D.Config(
+              pool_size=2, padding='valid', masked_average=True
+          ),
+          (8,),
+      ),
+  )
+  def test_pooling(self, config, input_shape):
+    _compare_stateless_float(self, config, input_shape)
+
+
+class EmbeddingCrossBackendTest(parameterized.TestCase):
+  """Embedding: JAX vs MLX."""
+
+  def test_embedding(self):
+    config = sl.Embedding.Config(num_embeddings=32, dimension=16)
+    _compare_parametric_int(self, config)
+
+
+class DenseCrossBackendTest(parameterized.TestCase):
+  """Dense: JAX vs MLX."""
+
+  def test_dense_plain(self):
+    config = sl.Dense.Config(features=16)
+    _compare_parametric_float(self, config, (8,))
+
+  def test_dense_with_bias(self):
+    config = sl.Dense.Config(features=16, use_bias=True)
+    _compare_parametric_float(self, config, (8,))
+
+  def test_dense_with_activation(self):
+    config = sl.Dense.Config(features=16, activation=jax.nn.relu)
+    _compare_parametric_float(self, config, (8,))
+
+
+class ConvolutionCrossBackendTest(parameterized.TestCase):
+  """Convolution: JAX vs MLX."""
+
+  def test_conv1d_causal(self):
+    config = sl.Conv1D.Config(filters=8, kernel_size=3, padding='causal')
+    _compare_parametric_float(self, config, (4,))
+
+  def test_conv1d_causal_valid(self):
+    config = sl.Conv1D.Config(filters=8, kernel_size=3, padding='causal_valid')
+    _compare_parametric_float(self, config, (4,))
+
+  def test_depthwise_conv1d(self):
+    config = sl.DepthwiseConv1D.Config(kernel_size=3, padding='causal')
+    _compare_parametric_float(self, config, (4,))
+
+  def test_conv1d_transpose(self):
+    config = sl.Conv1DTranspose.Config(
+        filters=8, kernel_size=3, strides=2, padding='causal'
+    )
+    _compare_parametric_float(self, config, (4,))
+
+  def test_conv1d_with_bias(self):
+    config = sl.Conv1D.Config(
+        filters=8, kernel_size=3, padding='causal', use_bias=True
+    )
+    _compare_parametric_float(self, config, (4,))
+
+
+class NormalizationCrossBackendTest(parameterized.TestCase):
+  """Normalization: JAX vs MLX."""
+
+  def test_rms_norm(self):
+    config = sl.RMSNormalization.Config()
+    _compare_parametric_float(self, config, (16,))
+
+  def test_layer_norm(self):
+    config = sl.LayerNormalization.Config()
+    _compare_parametric_float(self, config, (16,))
+
+  def test_l2_normalize(self):
+    config = sl.L2Normalize.Config()
+    _compare_stateless_float(self, config, (16,))
+
+  def test_l2_normalize_multi_axis(self):
+    config = sl.L2Normalize.Config(axis=(-2, -1))
+    _compare_stateless_float(self, config, (4, 3))
+
+  def test_batch_norm(self):
+    config = sl.BatchNormalization.Config()
+    rng = np.random.RandomState(42)
+    batch_size, time = 2, 8
+    input_shape = (16,)
+    values = rng.randn(batch_size, time, *input_shape).astype(np.float32)
+    mask = np.ones((batch_size, time), dtype=bool)
+
+    # JAX: init returns both 'params' and 'batch_stats'.
+    jax_model = config.make()
+    x_jax = jax_types.Sequence(
+        jnp.array(values), jnp.array(mask, dtype=jnp.bool_)
+    )
+    variables = jax_model.init(jax.random.PRNGKey(0), x_jax, training=False)
+    jax_params = variables['params']
+    jax_batch_stats = variables['batch_stats']
+    jax_out = np.array(jax_model.apply(variables, x_jax, training=False).values)
+
+    # MLX: load params + batch_stats.
+    mlx_model = _make_mlx_model(config)
+    weight_converter.load_linen_params(
+        mlx_model,
+        jax_params,
+        config,
+        input_spec=ShapeDType(input_shape, mx.float32),
+        batch_stats=jax_batch_stats,
+    )
+    x_mx = Sequence(mx.array(values), mx.array(mask, dtype=mx.bool_))
+    mlx_out = np.array(mlx_model.layer(x_mx, training=False).values)
+
+    np.testing.assert_allclose(mlx_out, jax_out, atol=1e-5, rtol=1e-5)
+
+  def test_batch_norm_no_affine(self):
+    config = sl.BatchNormalization.Config(use_scale=False, use_bias=False)
+    rng = np.random.RandomState(42)
+    batch_size, time = 2, 8
+    input_shape = (16,)
+    values = rng.randn(batch_size, time, *input_shape).astype(np.float32)
+    mask = np.ones((batch_size, time), dtype=bool)
+
+    jax_model = config.make()
+    x_jax = jax_types.Sequence(
+        jnp.array(values), jnp.array(mask, dtype=jnp.bool_)
+    )
+    variables = jax_model.init(jax.random.PRNGKey(0), x_jax, training=False)
+    jax_batch_stats = variables.get('batch_stats', {})
+    # No params when scale/bias disabled — only batch_stats.
+    jax_params = variables.get('params', {})
+    jax_out = np.array(jax_model.apply(variables, x_jax, training=False).values)
+
+    mlx_model = _make_mlx_model(config)
+    weight_converter.load_linen_params(
+        mlx_model,
+        jax_params,
+        config,
+        input_spec=ShapeDType(input_shape, mx.float32),
+        batch_stats=jax_batch_stats,
+    )
+    x_mx = Sequence(mx.array(values), mx.array(mask, dtype=mx.bool_))
+    mlx_out = np.array(mlx_model.layer(x_mx, training=False).values)
+
+    np.testing.assert_allclose(mlx_out, jax_out, atol=1e-5, rtol=1e-5)
+
+  # GroupNorm: JAX layer() reduces over time (non-cumulative), MLX normalizes
+  # per-timestep by design. Cross-backend comparison requires cumulative mode
+  # which differs semantically. Skipped.
+
+
+class SelfAttentionCrossBackendTest(parameterized.TestCase):
+  """Self-attention: JAX vs MLX."""
+
+  def test_basic(self):
+    config = sl.DotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=8,
+        max_past_horizon=16,
+        max_future_horizon=0,
+    )
+    _compare_parametric_float(self, config, (16,), atol=1e-4, rtol=1e-4)
+
+  def test_with_rope(self):
+    config = sl.DotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=8,
+        max_past_horizon=16,
+        max_future_horizon=0,
+        query_network=sl.ApplyRotaryPositionalEncoding.Config(
+            max_wavelength=10_000.0,
+        ),
+        key_network=sl.ApplyRotaryPositionalEncoding.Config(
+            max_wavelength=10_000.0,
+        ),
+    )
+    _compare_parametric_float(self, config, (16,), atol=1e-4, rtol=1e-4)
+
+
+class LocalSelfAttentionCrossBackendTest(parameterized.TestCase):
+  """Local self-attention: JAX vs MLX."""
+
+  def test_basic(self):
+    from sequence_layers.jax.attention import \
+        local_dot_product_self_attention as jax_local_attn
+
+    config = jax_local_attn.LocalDotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        block_size=1,
+        max_past_horizon=8,
+        max_future_horizon=0,
+    )
+    _compare_parametric_float(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+  def test_with_soft_cap(self):
+    from sequence_layers.jax.attention import \
+        local_dot_product_self_attention as jax_local_attn
+
+    config = jax_local_attn.LocalDotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        block_size=1,
+        max_past_horizon=8,
+        max_future_horizon=0,
+        attention_logits_soft_cap=50.0,
+    )
+    _compare_parametric_float(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+
+class StepModeLocalSelfAttentionTest(parameterized.TestCase):
+  """Step-mode cross-backend: local self-attention."""
+
+  def test_causal(self):
+    from sequence_layers.jax.attention import \
+        local_dot_product_self_attention as jax_local_attn
+
+    config = jax_local_attn.LocalDotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        block_size=1,
+        max_past_horizon=8,
+        max_future_horizon=0,
+    )
+    _compare_step_mode(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+
+class DSPCrossBackendTest(parameterized.TestCase):
+  """DSP layers: JAX vs MLX."""
+
+  def test_delay(self):
+    config = sl.Delay.Config(length=2)
+    _compare_stateless_float(self, config, (8,))
+
+  def test_lookahead(self):
+    config = sl.Lookahead.Config(length=3)
+    _compare_stateless_float(self, config, (8,))
+
+  def test_window(self):
+    config = sl.Window.Config(axis=-1)
+    _compare_stateless_float(self, config, (8,))
+
+  def test_frame(self):
+    config = sl.Frame.Config(frame_length=4, frame_step=2)
+    _compare_stateless_float(self, config, (1,), time=8)
+
+  def test_frame_causal(self):
+    config = sl.Frame.Config(frame_length=4, frame_step=2, padding='causal')
+    _compare_stateless_float(self, config, (1,), time=8)
+
+  def test_overlap_add_causal(self):
+    config = sl.OverlapAdd.Config(
+        frame_length=4, frame_step=2, padding='causal'
+    )
+    _compare_stateless_float(self, config, (4,), time=8)
+
+  def test_fft(self):
+    config = sl.FFT.Config()
+    _compare_stateless_float(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+  def test_ifft(self):
+    config = sl.IFFT.Config()
+    _compare_stateless_float(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+  def test_rfft(self):
+    config = sl.RFFT.Config()
+    _compare_stateless_float(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+  def test_rfft_irfft_roundtrip(self):
+    # IRFFT needs complex input; test via RFFT→IRFFT roundtrip.
+    config = sl.Serial.Config([
+        sl.RFFT.Config(),
+        sl.IRFFT.Config(),
+    ])
+    _compare_stateless_float(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+  def test_stft(self):
+    config = sl.STFT.Config(
+        frame_length=8,
+        frame_step=4,
+        fft_length=8,
+        output_magnitude=True,
+    )
+    _compare_stateless_float(self, config, (1,), time=16, atol=1e-4, rtol=1e-4)
+
+  def test_stft_complex(self):
+    config = sl.STFT.Config(
+        frame_length=8,
+        frame_step=4,
+        fft_length=8,
+        output_magnitude=False,
+    )
+    _compare_stateless_float(self, config, (1,), time=16, atol=1e-4, rtol=1e-4)
+
+  def test_stft_inverse_stft_roundtrip(self):
+    # InverseSTFT needs complex input; test via STFT→InverseSTFT roundtrip.
+    config = sl.Serial.Config([
+        sl.STFT.Config(
+            frame_length=8,
+            frame_step=4,
+            fft_length=8,
+            output_magnitude=False,
+        ),
+        sl.InverseSTFT.Config(
+            frame_length=8,
+            frame_step=4,
+            fft_length=8,
+            time_padding='causal',
+        ),
+    ])
+    _compare_stateless_float(self, config, (1,), time=16, atol=1e-4, rtol=1e-4)
+
+  def test_mel_spectrogram(self):
+    config = sl.LinearToMelSpectrogram.Config(
+        num_mel_bins=10,
+        sample_rate=16000.0,
+        lower_edge_hertz=80.0,
+        upper_edge_hertz=7600.0,
+    )
+    # Mel filterbank computation may differ slightly between backends
+    # due to different float64 vs float32 precision paths.
+    _compare_stateless_float(self, config, (5,), atol=0.05, rtol=0.1)
+
+
+class CombinatorsCrossBackendTest(parameterized.TestCase):
+  """Combinators: JAX vs MLX."""
+
+  def test_serial(self):
+    config = sl.Serial.Config([
+        sl.Dense.Config(features=16),
+        sl.Relu.Config(),
+        sl.Dense.Config(features=8),
+    ])
+    _compare_parametric_float(self, config, (8,))
+
+  def test_residual(self):
+    config = sl.Residual.Config([
+        sl.Dense.Config(features=8),
+        sl.Relu.Config(),
+    ])
+    _compare_parametric_float(self, config, (8,))
+
+  def test_repeat(self):
+    config = sl.Repeat.Config(
+        num_repeats=2,
+        layer=sl.Serial.Config([
+            sl.Dense.Config(features=8),
+            sl.Relu.Config(),
+        ]),
+    )
+    _compare_parametric_float(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+
+class CrossAttentionCrossBackendTest(parameterized.TestCase):
+  """Cross-attention (DotProductAttention): JAX vs MLX."""
+
+  def _make_constants(self, batch_size, time, source_features, rng):
+    source_values = rng.randn(batch_size, time, source_features).astype(
+        np.float32
+    )
+    source_mask = np.ones((batch_size, time), dtype=bool)
+    jax_source = jax_types.Sequence(
+        jnp.array(source_values), jnp.array(source_mask, dtype=jnp.bool_)
+    )
+    mlx_source = Sequence(
+        mx.array(source_values), mx.array(source_mask, dtype=mx.bool_)
+    )
+    return {'enc': jax_source}, {'enc': mlx_source}
+
+  def test_basic(self):
+    from sequence_layers.jax.attention import \
+        dot_product_attention as jax_cross_attn
+
+    config = jax_cross_attn.DotProductAttention.Config(
+        source_name='enc',
+        num_heads=2,
+        units_per_head=4,
+    )
+    _compare_with_constants(
+        self,
+        config,
+        (8,),
+        lambda b, t, rng: self._make_constants(b, t, 12, rng),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_same_features(self):
+    """Source and input have the same feature dimension."""
+    from sequence_layers.jax.attention import \
+        dot_product_attention as jax_cross_attn
+
+    config = jax_cross_attn.DotProductAttention.Config(
+        source_name='enc',
+        num_heads=4,
+        units_per_head=4,
+    )
+    _compare_with_constants(
+        self,
+        config,
+        (16,),
+        lambda b, t, rng: self._make_constants(b, t, 16, rng),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+
+class StreamingAttentionCrossBackendTest(parameterized.TestCase):
+  """Streaming cross-attention: JAX vs MLX weight conversion."""
+
+  def _make_constants(self, batch_size, time, source_features, rng):
+    source_values = rng.randn(batch_size, time, source_features).astype(
+        np.float32
+    )
+    source_mask = np.ones((batch_size, time), dtype=bool)
+    jax_source = jax_types.Sequence(
+        jnp.array(source_values), jnp.array(source_mask, dtype=jnp.bool_)
+    )
+    mlx_source = Sequence(
+        mx.array(source_values), mx.array(source_mask, dtype=mx.bool_)
+    )
+    return {'src': jax_source}, {'src': mlx_source}
+
+  def test_basic(self):
+    from sequence_layers.jax.attention import \
+        streaming_dot_product_attention as jax_streaming_attn
+
+    config = jax_streaming_attn.StreamingDotProductAttention.Config(
+        source_name='src',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=8,
+    )
+    _compare_with_constants(
+        self,
+        config,
+        (8,),
+        lambda b, t, rng: self._make_constants(b, t, 12, rng),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_with_future_horizon(self):
+    from sequence_layers.jax.attention import \
+        streaming_dot_product_attention as jax_streaming_attn
+
+    config = jax_streaming_attn.StreamingDotProductAttention.Config(
+        source_name='src',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=4,
+        max_future_horizon=2,
+    )
+    _compare_with_constants(
+        self,
+        config,
+        (8,),
+        lambda b, t, rng: self._make_constants(b, t, 8, rng),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Step-mode cross-backend tests
+# ---------------------------------------------------------------------------
+
+
+def _compare_step_mode(
+    test_case,
+    config,
+    input_shape,
+    *,
+    batch_size=1,
+    num_steps=6,
+    block_size=1,
+    atol=1e-5,
+    rtol=1e-5,
+    seed=42,
+    constants_fn=None,
+    stream_constants_fn=None,
+):
+  """Compare JAX and MLX step-by-step outputs with shared weights.
+
+  Args:
+    test_case: A TestCase instance.
+    config: A SequenceLayerConfig.
+    input_shape: Channel shape, e.g. (8,).
+    batch_size: Batch dimension.
+    num_steps: Number of step invocations.
+    block_size: Number of timesteps per step. Must match the layer's
+        block_size for layers that require it (e.g. Frame, OverlapAdd).
+    atol: Absolute tolerance.
+    rtol: Relative tolerance.
+    seed: Random seed.
+    constants_fn: For static cross-attention. Returns (jax_constants,
+        mlx_constants) given (batch_size, rng).
+    stream_constants_fn: For streaming cross-attention. Returns
+        (jax_constants, mlx_constants) given (batch_size, time, rng).
+        Each has shape [batch, time, features]. Will be sliced per step.
+  """
+  rng = np.random.RandomState(seed)
+  step_values = [
+      rng.randn(batch_size, block_size, *input_shape).astype(np.float32)
+      for _ in range(num_steps)
+  ]
+  step_masks = [
+      np.ones((batch_size, block_size), dtype=bool) for _ in range(num_steps)
+  ]
+  total_time = num_steps * block_size
+
+  jax_constants = None
+  mlx_constants = None
+  jax_stream_constants = None
+  mlx_stream_constants = None
+
+  if constants_fn is not None:
+    jax_constants, mlx_constants = constants_fn(batch_size, rng)
+
+  if stream_constants_fn is not None:
+    jax_stream_constants, mlx_stream_constants = stream_constants_fn(
+        batch_size, total_time, rng
+    )
+
+  # --- JAX init + step ---
+  jax_model = config.make()
+  # Init with a full sequence to get params.
+  full_values = np.concatenate(step_values, axis=1)
+  full_mask = np.ones((batch_size, total_time), dtype=bool)
+  x_init = jax_types.Sequence(
+      jnp.array(full_values), jnp.array(full_mask, dtype=jnp.bool_)
+  )
+  init_constants = jax_constants
+  if init_constants is None and jax_stream_constants is not None:
+    init_constants = jax_stream_constants
+  variables = jax_model.init(
+      jax.random.PRNGKey(0),
+      x_init,
+      training=False,
+      constants=init_constants,
+  )
+  jax_params = variables.get('params', {})
+  jax_variables = {'params': jax_params} if jax_params else variables
+
+  jax_spec = jax.ShapeDtypeStruct(input_shape, jnp.float32)
+  jax_state = jax_model.apply(
+      jax_variables,
+      batch_size,
+      jax_spec,
+      training=False,
+      constants=init_constants,
+      method=jax_model.get_initial_state,
+  )
+
+  jax_outputs = []
+  for i in range(num_steps):
+    x_jax = jax_types.Sequence(
+        jnp.array(step_values[i]),
+        jnp.array(step_masks[i], dtype=jnp.bool_),
+    )
+    step_c = jax_constants
+    if jax_stream_constants is not None:
+      s = i * block_size
+      e = s + block_size
+      step_c = {
+          k: jax_types.Sequence(v.values[:, s:e], v.mask[:, s:e])
+          for k, v in jax_stream_constants.items()
+      }
+    y_jax, jax_state = jax_model.apply(
+        jax_variables,
+        x_jax,
+        jax_state,
+        training=False,
+        constants=step_c,
+        method=jax_model.step,
+    )
+    jax_outputs.append(np.array(y_jax.values))
+
+  # --- MLX init + step ---
+  mlx_model = _make_mlx_model(config)
+  mlx_init_constants = mlx_constants
+  if mlx_init_constants is None and mlx_stream_constants is not None:
+    mlx_init_constants = mlx_stream_constants
+  if jax_params:
+    weight_converter.load_linen_params(
+        mlx_model,
+        jax_params,
+        config,
+        input_spec=ShapeDType(input_shape, mx.float32),
+        constants=mlx_init_constants,
+    )
+  # Skip _materialize_deferred for param-less layers — no deferred weights.
+
+  mlx_spec = ShapeDType(input_shape, mx.float32)
+  # Slice stream constants to time=1 for get_initial_state.
+  state_constants = mlx_constants
+  if mlx_stream_constants is not None:
+    state_constants = {
+        k: Sequence(v.values[:, :1], v.mask[:, :1])
+        for k, v in mlx_stream_constants.items()
+    }
+  mlx_state = mlx_model.get_initial_state(
+      batch_size, mlx_spec, training=False, constants=state_constants
+  )
+
+  mlx_outputs = []
+  for i in range(num_steps):
+    x_mx = Sequence(
+        mx.array(step_values[i]),
+        mx.array(step_masks[i], dtype=mx.bool_),
+    )
+    step_c = mlx_constants
+    if mlx_stream_constants is not None:
+      s = i * block_size
+      e = s + block_size
+      step_c = {
+          k: Sequence(v.values[:, s:e], v.mask[:, s:e])
+          for k, v in mlx_stream_constants.items()
+      }
+    y_mx, mlx_state = mlx_model.step(
+        x_mx, mlx_state, training=False, constants=step_c
+    )
+    mx.eval(y_mx.values)
+    mlx_outputs.append(np.array(y_mx.values))
+
+  # --- Compare ---
+  for i, (jax_out, mlx_out) in enumerate(zip(jax_outputs, mlx_outputs)):
+    np.testing.assert_allclose(
+        mlx_out,
+        jax_out,
+        atol=atol,
+        rtol=rtol,
+        err_msg=f'{config.__class__.__qualname__} step {i}: outputs differ',
+    )
+
+
+class StepModeConvolutionTest(parameterized.TestCase):
+  """Step-mode cross-backend: convolution layers."""
+
+  def test_conv1d_causal(self):
+    config = sl.Conv1D.Config(filters=8, kernel_size=3, padding='causal')
+    _compare_step_mode(self, config, (4,))
+
+  def test_depthwise_conv1d_causal(self):
+    config = sl.DepthwiseConv1D.Config(kernel_size=3, padding='causal')
+    _compare_step_mode(self, config, (4,))
+
+  def test_conv1d_transpose_causal(self):
+    config = sl.Conv1DTranspose.Config(
+        filters=8, kernel_size=3, strides=2, padding='causal'
+    )
+    _compare_step_mode(self, config, (4,))
+
+
+class StepModeDenseNormTest(parameterized.TestCase):
+  """Step-mode cross-backend: Dense and normalization."""
+
+  def test_dense(self):
+    config = sl.Dense.Config(features=16)
+    _compare_step_mode(self, config, (8,))
+
+  def test_rms_norm(self):
+    config = sl.RMSNormalization.Config()
+    _compare_step_mode(self, config, (16,))
+
+  def test_layer_norm(self):
+    config = sl.LayerNormalization.Config()
+    _compare_step_mode(self, config, (16,))
+
+
+class StepModeSelfAttentionTest(parameterized.TestCase):
+  """Step-mode cross-backend: self-attention."""
+
+  def test_causal(self):
+    config = sl.DotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=16,
+        max_future_horizon=0,
+    )
+    _compare_step_mode(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+  def test_causal_with_bias(self):
+    config = sl.DotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=16,
+        max_future_horizon=0,
+        use_bias=True,
+    )
+    _compare_step_mode(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+  def test_causal_with_rope(self):
+    config = sl.DotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=16,
+        max_future_horizon=0,
+        query_network=sl.ApplyRotaryPositionalEncoding.Config(
+            max_wavelength=10_000.0,
+        ),
+        key_network=sl.ApplyRotaryPositionalEncoding.Config(
+            max_wavelength=10_000.0,
+        ),
+    )
+    _compare_step_mode(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+  def test_gqa_with_rope(self):
+    config = sl.DotProductSelfAttention.Config(
+        num_heads=4,
+        units_per_head=4,
+        max_past_horizon=16,
+        max_future_horizon=0,
+        num_kv_heads=2,
+        input_projection=attn_common.SeparateQueryKeyValueProjection(),
+        query_network=sl.ApplyRotaryPositionalEncoding.Config(
+            max_wavelength=10_000.0,
+        ),
+        key_network=sl.ApplyRotaryPositionalEncoding.Config(
+            max_wavelength=10_000.0,
+        ),
+    )
+    _compare_step_mode(self, config, (16,), atol=1e-4, rtol=1e-4)
+
+
+class StepModeCrossAttentionTest(parameterized.TestCase):
+  """Step-mode cross-backend: cross-attention."""
+
+  def _make_constants(self, batch_size, rng, source_features=12, source_time=8):
+    from sequence_layers.jax.attention import \
+        dot_product_attention as jax_cross_attn
+
+    source_values = rng.randn(batch_size, source_time, source_features).astype(
+        np.float32
+    )
+    source_mask = np.ones((batch_size, source_time), dtype=bool)
+    jax_source = jax_types.Sequence(
+        jnp.array(source_values), jnp.array(source_mask, dtype=jnp.bool_)
+    )
+    mlx_source = Sequence(
+        mx.array(source_values), mx.array(source_mask, dtype=mx.bool_)
+    )
+    return {'enc': jax_source}, {'enc': mlx_source}
+
+  def test_cross_attention(self):
+    from sequence_layers.jax.attention import \
+        dot_product_attention as jax_cross_attn
+
+    config = jax_cross_attn.DotProductAttention.Config(
+        source_name='enc',
+        num_heads=2,
+        units_per_head=4,
+    )
+    _compare_step_mode(
+        self,
+        config,
+        (8,),
+        constants_fn=lambda b, rng: self._make_constants(b, rng),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_cross_attention_different_dims(self):
+    from sequence_layers.jax.attention import \
+        dot_product_attention as jax_cross_attn
+
+    config = jax_cross_attn.DotProductAttention.Config(
+        source_name='enc',
+        num_heads=2,
+        units_per_head=4,
+    )
+    _compare_step_mode(
+        self,
+        config,
+        (16,),
+        constants_fn=lambda b, rng: self._make_constants(
+            b, rng, source_features=8, source_time=12
+        ),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_cross_attention_with_bias(self):
+    from sequence_layers.jax.attention import \
+        dot_product_attention as jax_cross_attn
+
+    config = jax_cross_attn.DotProductAttention.Config(
+        source_name='enc',
+        num_heads=2,
+        units_per_head=4,
+        use_bias=True,
+    )
+    _compare_step_mode(
+        self,
+        config,
+        (8,),
+        constants_fn=lambda b, rng: self._make_constants(b, rng),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+
+class StepModeStreamingAttentionTest(parameterized.TestCase):
+  """Step-mode cross-backend: streaming cross-attention."""
+
+  def _make_stream_constants(self, batch_size, time, rng, source_features=12):
+    source_values = rng.randn(batch_size, time, source_features).astype(
+        np.float32
+    )
+    source_mask = np.ones((batch_size, time), dtype=bool)
+    jax_source = jax_types.Sequence(
+        jnp.array(source_values), jnp.array(source_mask, dtype=jnp.bool_)
+    )
+    mlx_source = Sequence(
+        mx.array(source_values), mx.array(source_mask, dtype=mx.bool_)
+    )
+    return {'src': jax_source}, {'src': mlx_source}
+
+  def test_streaming_attention(self):
+    from sequence_layers.jax.attention import \
+        streaming_dot_product_attention as jax_streaming_attn
+
+    config = jax_streaming_attn.StreamingDotProductAttention.Config(
+        source_name='src',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=8,
+    )
+    _compare_step_mode(
+        self,
+        config,
+        (8,),
+        stream_constants_fn=lambda b, t, rng: self._make_stream_constants(
+            b, t, rng, source_features=12
+        ),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_streaming_with_future_horizon(self):
+    from sequence_layers.jax.attention import \
+        streaming_dot_product_attention as jax_streaming_attn
+
+    config = jax_streaming_attn.StreamingDotProductAttention.Config(
+        source_name='src',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=6,
+        max_future_horizon=2,
+    )
+    _compare_step_mode(
+        self,
+        config,
+        (8,),
+        stream_constants_fn=lambda b, t, rng: self._make_stream_constants(
+            b, t, rng, source_features=12
+        ),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_streaming_with_rope(self):
+    from sequence_layers.jax.attention import \
+        streaming_dot_product_attention as jax_streaming_attn
+
+    config = jax_streaming_attn.StreamingDotProductAttention.Config(
+        source_name='src',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=8,
+        query_network=sl.ApplyRotaryPositionalEncoding.Config(
+            max_wavelength=10_000.0,
+        ),
+        key_network=sl.ApplyRotaryPositionalEncoding.Config(
+            max_wavelength=10_000.0,
+        ),
+    )
+    _compare_step_mode(
+        self,
+        config,
+        (8,),
+        stream_constants_fn=lambda b, t, rng: self._make_stream_constants(
+            b, t, rng, source_features=12
+        ),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  @unittest.skip('StreamingLocalDotProductAttention not implemented in MLX')
+  def test_streaming_local(self):
+    from sequence_layers.jax.attention import \
+        streaming_local_dot_product_attention as jax_streaming_local_attn
+
+    config = jax_streaming_local_attn.StreamingLocalDotProductAttention.Config(
+        source_name='src',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=8,
+        block_size=1,
+    )
+    _compare_step_mode(
+        self,
+        config,
+        (8,),
+        stream_constants_fn=lambda b, t, rng: self._make_stream_constants(
+            b, t, rng, source_features=12
+        ),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_streaming_with_bias(self):
+    from sequence_layers.jax.attention import \
+        streaming_dot_product_attention as jax_streaming_attn
+
+    config = jax_streaming_attn.StreamingDotProductAttention.Config(
+        source_name='src',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=8,
+        use_bias=True,
+    )
+    _compare_step_mode(
+        self,
+        config,
+        (8,),
+        stream_constants_fn=lambda b, t, rng: self._make_stream_constants(
+            b, t, rng, source_features=12
+        ),
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+
+class StepModeDSPTest(parameterized.TestCase):
+  """Step-mode cross-backend: DSP layers."""
+
+  def test_delay(self):
+    config = sl.Delay.Config(length=3)
+    _compare_step_mode(self, config, (8,))
+
+  def test_lookahead(self):
+    config = sl.Lookahead.Config(length=3)
+    _compare_step_mode(self, config, (8,))
+
+  def test_window(self):
+    config = sl.Window.Config(axis=-1)
+    _compare_step_mode(self, config, (8,))
+
+  def test_frame_causal(self):
+    config = sl.Frame.Config(frame_length=4, frame_step=2, padding='causal')
+    _compare_step_mode(self, config, (1,), block_size=2, num_steps=6)
+
+  def test_overlap_add_causal(self):
+    config = sl.OverlapAdd.Config(
+        frame_length=4, frame_step=2, padding='causal'
+    )
+    _compare_step_mode(self, config, (4,), num_steps=6)
+
+  def test_overlap_add_causal_large(self):
+    config = sl.OverlapAdd.Config(
+        frame_length=8, frame_step=4, padding='causal'
+    )
+    _compare_step_mode(self, config, (8,), num_steps=6)
+
+
+class StepModeCombinatorTest(parameterized.TestCase):
+  """Step-mode cross-backend: combinators."""
+
+  def test_serial(self):
+    config = sl.Serial.Config([
+        sl.Dense.Config(features=16),
+        sl.Relu.Config(),
+        sl.Dense.Config(features=8),
+    ])
+    _compare_step_mode(self, config, (8,))
+
+  def test_residual(self):
+    config = sl.Residual.Config([
+        sl.Dense.Config(features=8),
+        sl.Relu.Config(),
+    ])
+    _compare_step_mode(self, config, (8,))
+
+  def test_repeat_with_attention(self):
+    config = sl.Repeat.Config(
+        num_repeats=2,
+        layer=sl.Serial.Config([
+            sl.Residual.Config([
+                sl.RMSNormalization.Config(),
+                sl.DotProductSelfAttention.Config(
+                    num_heads=2,
+                    units_per_head=4,
+                    max_past_horizon=16,
+                    max_future_horizon=0,
+                ),
+                sl.Flatten.Config(),
+            ]),
+            sl.Residual.Config([
+                sl.RMSNormalization.Config(),
+                sl.Dense.Config(features=8),
+            ]),
+        ]),
+    )
+    _compare_step_mode(self, config, (8,), atol=1e-3, rtol=1e-3)
+
+
+class StepModePoolingTest(parameterized.TestCase):
+  """Step-mode cross-backend: pooling layers."""
+
+  def test_max_pool_causal(self):
+    config = sl.MaxPooling1D.Config(pool_size=3, padding='causal')
+    _compare_step_mode(self, config, (8,))
+
+  def test_min_pool_causal(self):
+    config = sl.MinPooling1D.Config(pool_size=3, padding='causal')
+    _compare_step_mode(self, config, (8,))
+
+  def test_avg_pool_causal(self):
+    config = sl.AveragePooling1D.Config(pool_size=3, padding='causal')
+    _compare_step_mode(self, config, (8,))
+
+
+class StepModeGQATest(parameterized.TestCase):
+  """Step-mode cross-backend: grouped query attention."""
+
+  def test_gqa(self):
+    config = sl.DotProductSelfAttention.Config(
+        num_heads=4,
+        units_per_head=4,
+        max_past_horizon=16,
+        max_future_horizon=0,
+        num_kv_heads=2,
+        input_projection=attn_common.SeparateQueryKeyValueProjection(),
+    )
+    _compare_step_mode(self, config, (16,), atol=1e-4, rtol=1e-4)
+
+
+class GQACrossBackendTest(parameterized.TestCase):
+  """Layer-mode cross-backend: grouped query attention."""
+
+  def test_gqa(self):
+    config = sl.DotProductSelfAttention.Config(
+        num_heads=4,
+        units_per_head=4,
+        max_past_horizon=16,
+        max_future_horizon=0,
+        num_kv_heads=2,
+        input_projection=attn_common.SeparateQueryKeyValueProjection(),
+    )
+    _compare_parametric_float(self, config, (16,), atol=1e-4, rtol=1e-4)
+
+
+# ---------------------------------------------------------------------------
+# Parallel combinator cross-backend tests
+# ---------------------------------------------------------------------------
+
+
+class ParallelCrossBackendTest(parameterized.TestCase):
+  """Cross-backend: Parallel combinator (layer + step)."""
+
+  def test_parallel_add_layer(self):
+    config = sl.Parallel.Config(
+        layers=[
+            sl.Dense.Config(features=8),
+            sl.Dense.Config(features=8),
+        ],
+        combination=sl.CombinationMode.ADD,
+    )
+    _compare_parametric_float(self, config, (8,))
+
+  def test_parallel_concat_layer(self):
+    config = sl.Parallel.Config(
+        layers=[
+            sl.Dense.Config(features=4),
+            sl.Dense.Config(features=4),
+        ],
+        combination=sl.CombinationMode.CONCAT,
+    )
+    _compare_parametric_float(self, config, (8,))
+
+  def test_parallel_stack_layer(self):
+    config = sl.Parallel.Config(
+        layers=[
+            sl.Dense.Config(features=8),
+            sl.Dense.Config(features=8),
+        ],
+        combination=sl.CombinationMode.STACK,
+    )
+    _compare_parametric_float(self, config, (8,))
+
+  def test_parallel_add_step(self):
+    config = sl.Parallel.Config(
+        layers=[
+            sl.Dense.Config(features=8),
+            sl.Dense.Config(features=8),
+        ],
+        combination=sl.CombinationMode.ADD,
+    )
+    _compare_step_mode(self, config, (8,))
+
+  def test_parallel_concat_step(self):
+    config = sl.Parallel.Config(
+        layers=[
+            sl.Dense.Config(features=4),
+            sl.Dense.Config(features=4),
+        ],
+        combination=sl.CombinationMode.CONCAT,
+    )
+    _compare_step_mode(self, config, (8,))
+
+
+# ---------------------------------------------------------------------------
+# Partially-masked input tests
+# ---------------------------------------------------------------------------
+
+
+def _compare_parametric_float_masked(
+    test_case,
+    config,
+    input_shape,
+    *,
+    batch_size=2,
+    time=8,
+    atol=1e-5,
+    rtol=1e-5,
+    seed=42,
+):
+  """Like _compare_parametric_float but with partially-masked inputs."""
+  rng = np.random.RandomState(seed)
+  values = rng.randn(batch_size, time, *input_shape).astype(np.float32)
+  # Create a mask where ~25% of timesteps are invalid.
+  mask = rng.rand(batch_size, time) > 0.25
+
+  # JAX.
+  jax_model = config.make()
+  x_jax = jax_types.Sequence(
+      jnp.array(values), jnp.array(mask, dtype=jnp.bool_)
+  )
+  variables = jax_model.init(jax.random.PRNGKey(0), x_jax, training=False)
+  jax_params = variables.get('params', {})
+  jax_variables = {'params': jax_params} if jax_params else variables
+  jax_out = jax_model.apply(jax_variables, x_jax, training=False)
+  jax_values = np.array(jax_out.values)
+  jax_mask = np.array(jax_out.mask)
+
+  # MLX.
+  mlx_model = _make_mlx_model(config)
+  if jax_params:
+    weight_converter.load_linen_params(
+        mlx_model,
+        jax_params,
+        config,
+        input_spec=ShapeDType(input_shape, mx.float32),
+    )
+  else:
+    export._materialize_deferred(
+        mlx_model,
+        batch_size=1,
+        input_spec=ShapeDType(input_shape, mx.float32),
+    )
+  x_mx = Sequence(mx.array(values), mx.array(mask, dtype=mx.bool_))
+  mlx_out = mlx_model.layer(x_mx, training=False)
+  mlx_values = np.array(mlx_out.values)
+  mlx_mask = np.array(mlx_out.mask)
+
+  # Compare valid timesteps only.
+  out_mask = jax_mask
+  if jax_values.shape != mlx_values.shape:
+    test_case.fail(
+        f'{config.__class__.__qualname__}: shape mismatch'
+        f' jax={jax_values.shape} vs mlx={mlx_values.shape}'
+    )
+
+  # Flatten and compare only valid positions.
+  for b in range(batch_size):
+    for t in range(out_mask.shape[1]):
+      if out_mask[b, t]:
+        np.testing.assert_allclose(
+            mlx_values[b, t],
+            jax_values[b, t],
+            atol=atol,
+            rtol=rtol,
+            err_msg=(
+                f'{config.__class__.__qualname__} batch={b} time={t}:'
+                ' valid outputs differ'
+            ),
+        )
+
+  # Masks should match.
+  np.testing.assert_array_equal(
+      mlx_mask,
+      jax_mask,
+      err_msg=f'{config.__class__.__qualname__}: masks differ',
+  )
+
+
+class MaskedInputDenseTest(parameterized.TestCase):
+  """Cross-backend with partially-masked inputs: Dense."""
+
+  def test_dense_masked(self):
+    config = sl.Dense.Config(features=16)
+    _compare_parametric_float_masked(self, config, (8,))
+
+
+class MaskedInputConvTest(parameterized.TestCase):
+  """Cross-backend with partially-masked inputs: Conv1D."""
+
+  def test_conv1d_causal_masked(self):
+    config = sl.Conv1D.Config(filters=8, kernel_size=3, padding='causal')
+    _compare_parametric_float_masked(self, config, (4,))
+
+  def test_depthwise_conv1d_masked(self):
+    config = sl.DepthwiseConv1D.Config(kernel_size=3, padding='causal')
+    _compare_parametric_float_masked(self, config, (4,))
+
+
+class MaskedInputNormTest(parameterized.TestCase):
+  """Cross-backend with partially-masked inputs: normalization."""
+
+  def test_rms_norm_masked(self):
+    config = sl.RMSNormalization.Config()
+    _compare_parametric_float_masked(self, config, (16,))
+
+  def test_layer_norm_masked(self):
+    config = sl.LayerNormalization.Config()
+    _compare_parametric_float_masked(self, config, (16,))
+
+
+class MaskedInputSelfAttentionTest(parameterized.TestCase):
+  """Cross-backend with partially-masked inputs: self-attention."""
+
+  def test_causal_masked(self):
+    config = sl.DotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=16,
+        max_future_horizon=0,
+    )
+    _compare_parametric_float_masked(self, config, (8,), atol=1e-4, rtol=1e-4)
+
+
+class MaskedInputPoolingTest(parameterized.TestCase):
+  """Cross-backend with partially-masked inputs: pooling."""
+
+  def test_max_pool_masked(self):
+    config = sl.MaxPooling1D.Config(pool_size=2, padding='causal')
+    _compare_parametric_float_masked(self, config, (8,))
+
+  def test_avg_pool_masked(self):
+    config = sl.AveragePooling1D.Config(pool_size=2, padding='causal')
+    _compare_parametric_float_masked(self, config, (8,))
+
+
+# ---------------------------------------------------------------------------
+# Integration tests: full model cross-backend comparison
+# ---------------------------------------------------------------------------
+
+
+def _compare_integration_float(
+    test_case,
+    config,
+    input_shape,
+    *,
+    batch_size=2,
+    time=8,
+    atol=1e-3,
+    rtol=1e-3,
+    seed=42,
+    constants_fn=None,
+):
+  """Compare a full model (layer mode) between JAX and MLX."""
+  rng = np.random.RandomState(seed)
+  values = rng.randn(batch_size, time, *input_shape).astype(np.float32)
+  mask = np.ones((batch_size, time), dtype=bool)
+
+  jax_constants = None
+  mlx_constants = None
+  if constants_fn is not None:
+    jax_constants, mlx_constants = constants_fn(batch_size, time, rng)
+
+  # JAX.
+  jax_model = config.make()
+  x_jax = jax_types.Sequence(
+      jnp.array(values), jnp.array(mask, dtype=jnp.bool_)
+  )
+  variables = jax_model.init(
+      jax.random.PRNGKey(0), x_jax, training=False, constants=jax_constants
+  )
+  jax_params = variables['params']
+  jax_out = jax_model.apply(
+      {'params': jax_params},
+      x_jax,
+      training=False,
+      constants=jax_constants,
+  )
+  jax_values = np.array(jax_out.values)
+
+  # MLX.
+  mlx_model = _make_mlx_model(config)
+  weight_converter.load_linen_params(
+      mlx_model,
+      jax_params,
+      config,
+      input_spec=ShapeDType(input_shape, mx.float32),
+      constants=mlx_constants,
+  )
+  x_mx = Sequence(mx.array(values), mx.array(mask, dtype=mx.bool_))
+  mlx_out = mlx_model.layer(x_mx, training=False, constants=mlx_constants)
+  mlx_values = np.array(mlx_out.values)
+
+  test_case.assertEqual(
+      jax_values.shape,
+      mlx_values.shape,
+      f'Shape mismatch: jax={jax_values.shape} vs mlx={mlx_values.shape}',
+  )
+  np.testing.assert_allclose(
+      mlx_values,
+      jax_values,
+      atol=atol,
+      rtol=rtol,
+      err_msg='Integration test: JAX vs MLX outputs differ',
+  )
+  return jax_params, jax_constants, mlx_constants
+
+
+def _compare_integration_int(
+    test_case,
+    config,
+    *,
+    vocab_size=256,
+    batch_size=2,
+    time=8,
+    atol=1e-3,
+    rtol=1e-3,
+    seed=42,
+):
+  """Compare a full model with integer token inputs (layer mode)."""
+  rng = np.random.RandomState(seed)
+  tokens = rng.randint(0, vocab_size, size=(batch_size, time)).astype(np.int32)
+  mask = np.ones((batch_size, time), dtype=bool)
+
+  # JAX.
+  jax_model = config.make()
+  x_jax = jax_types.Sequence(
+      jnp.array(tokens), jnp.array(mask, dtype=jnp.bool_)
+  )
+  variables = jax_model.init(jax.random.PRNGKey(0), x_jax, training=False)
+  jax_params = variables['params']
+  jax_out = np.array(
+      jax_model.apply({'params': jax_params}, x_jax, training=False).values
+  )
+
+  # MLX.
+  mlx_model = _make_mlx_model(config)
+  weight_converter.load_linen_params(mlx_model, jax_params, config)
+  x_mx = Sequence(
+      mx.array(tokens, dtype=mx.int32), mx.array(mask, dtype=mx.bool_)
+  )
+  mlx_out = np.array(mlx_model.layer(x_mx, training=False).values)
+
+  test_case.assertEqual(
+      jax_out.shape,
+      mlx_out.shape,
+      f'Shape mismatch: jax={jax_out.shape} vs mlx={mlx_out.shape}',
+  )
+  np.testing.assert_allclose(
+      mlx_out,
+      jax_out,
+      atol=atol,
+      rtol=rtol,
+      err_msg='Integration test: JAX vs MLX outputs differ',
+  )
+  return jax_params
+
+
+def _compare_integration_step(
+    test_case,
+    config,
+    input_shape,
+    jax_params,
+    *,
+    batch_size=2,
+    num_steps=8,
+    atol=1e-3,
+    rtol=1e-3,
+    seed=42,
+    jax_constants=None,
+    mlx_constants=None,
+):
+  """Compare step-by-step output of a full model between JAX and MLX."""
+  rng = np.random.RandomState(seed + 1)
+  step_values = [
+      rng.randn(batch_size, 1, *input_shape).astype(np.float32)
+      for _ in range(num_steps)
+  ]
+  step_masks = [np.ones((batch_size, 1), dtype=bool) for _ in range(num_steps)]
+
+  # JAX step.
+  jax_model = config.make()
+  jax_spec = jax.ShapeDtypeStruct(input_shape, jnp.float32)
+  jax_state = jax_model.apply(
+      {'params': jax_params},
+      batch_size,
+      jax_spec,
+      training=False,
+      constants=jax_constants,
+      method=jax_model.get_initial_state,
+  )
+  jax_outputs = []
+  for i in range(num_steps):
+    x_jax = jax_types.Sequence(
+        jnp.array(step_values[i]),
+        jnp.array(step_masks[i], dtype=jnp.bool_),
+    )
+    y_jax, jax_state = jax_model.apply(
+        {'params': jax_params},
+        x_jax,
+        jax_state,
+        training=False,
+        constants=jax_constants,
+        method=jax_model.step,
+    )
+    jax_outputs.append(np.array(y_jax.values))
+
+  # MLX step.
+  mlx_model = _make_mlx_model(config)
+  weight_converter.load_linen_params(
+      mlx_model,
+      jax_params,
+      config,
+      input_spec=ShapeDType(input_shape, mx.float32),
+      constants=mlx_constants,
+  )
+  mlx_spec = ShapeDType(input_shape, mx.float32)
+  mlx_state = mlx_model.get_initial_state(
+      batch_size, mlx_spec, training=False, constants=mlx_constants
+  )
+  mlx_outputs = []
+  for i in range(num_steps):
+    x_mx = Sequence(
+        mx.array(step_values[i]),
+        mx.array(step_masks[i], dtype=mx.bool_),
+    )
+    y_mx, mlx_state = mlx_model.step(
+        x_mx, mlx_state, training=False, constants=mlx_constants
+    )
+    mx.eval(y_mx.values)
+    mlx_outputs.append(np.array(y_mx.values))
+
+  for i, (jax_out, mlx_out) in enumerate(zip(jax_outputs, mlx_outputs)):
+    np.testing.assert_allclose(
+        mlx_out,
+        jax_out,
+        atol=atol,
+        rtol=rtol,
+        err_msg=f'Integration step {i}: JAX vs MLX outputs differ',
+    )
+
+
+def _compare_integration_int_step(
+    test_case,
+    config,
+    jax_params,
+    *,
+    vocab_size=256,
+    batch_size=1,
+    num_steps=8,
+    atol=1e-3,
+    rtol=1e-3,
+    seed=42,
+):
+  """Compare step-by-step output of a token model between JAX and MLX."""
+  rng = np.random.RandomState(seed + 1)
+  step_tokens = [
+      rng.randint(0, vocab_size, size=(batch_size, 1)).astype(np.int32)
+      for _ in range(num_steps)
+  ]
+  step_masks = [np.ones((batch_size, 1), dtype=bool) for _ in range(num_steps)]
+
+  # JAX step.
+  jax_model = config.make()
+  jax_spec = jax.ShapeDtypeStruct((), jnp.int32)
+  jax_state = jax_model.apply(
+      {'params': jax_params},
+      batch_size,
+      jax_spec,
+      training=False,
+      method=jax_model.get_initial_state,
+  )
+  jax_outputs = []
+  for i in range(num_steps):
+    x_jax = jax_types.Sequence(
+        jnp.array(step_tokens[i]),
+        jnp.array(step_masks[i], dtype=jnp.bool_),
+    )
+    y_jax, jax_state = jax_model.apply(
+        {'params': jax_params},
+        x_jax,
+        jax_state,
+        training=False,
+        method=jax_model.step,
+    )
+    jax_outputs.append(np.array(y_jax.values))
+
+  # MLX step.
+  mlx_model = _make_mlx_model(config)
+  weight_converter.load_linen_params(mlx_model, jax_params, config)
+  mlx_spec = ShapeDType((), mx.int32)
+  mlx_state = mlx_model.get_initial_state(batch_size, mlx_spec, training=False)
+  mlx_outputs = []
+  for i in range(num_steps):
+    x_mx = Sequence(
+        mx.array(step_tokens[i], dtype=mx.int32),
+        mx.array(step_masks[i], dtype=mx.bool_),
+    )
+    y_mx, mlx_state = mlx_model.step(x_mx, mlx_state, training=False)
+    mx.eval(y_mx.values)
+    mlx_outputs.append(np.array(y_mx.values))
+
+  for i, (jax_out, mlx_out) in enumerate(zip(jax_outputs, mlx_outputs)):
+    np.testing.assert_allclose(
+        mlx_out,
+        jax_out,
+        atol=atol,
+        rtol=rtol,
+        err_msg=f'Integration step {i}: JAX vs MLX outputs differ',
+    )
+
+
+class DecoderTransformerIntegrationTest(parameterized.TestCase):
+  """Cross-backend: decoder-only transformer (token input)."""
+
+  def _config(self, dim=32, num_heads=4, num_layers=2, vocab_size=64):
+    return sl.Serial.Config([
+        sl.Embedding.Config(num_embeddings=vocab_size, dimension=dim),
+        sl.Repeat.Config(
+            num_repeats=num_layers,
+            layer=sl.Serial.Config([
+                sl.Residual.Config([
+                    sl.RMSNormalization.Config(),
+                    sl.DotProductSelfAttention.Config(
+                        num_heads=num_heads,
+                        units_per_head=dim // num_heads,
+                        max_past_horizon=64,
+                        max_future_horizon=0,
+                        query_network=(
+                            sl.ApplyRotaryPositionalEncoding.Config(
+                                max_wavelength=10_000.0,
+                            )
+                        ),
+                        key_network=(
+                            sl.ApplyRotaryPositionalEncoding.Config(
+                                max_wavelength=10_000.0,
+                            )
+                        ),
+                    ),
+                    sl.Flatten.Config(),
+                ]),
+                sl.Residual.Config([
+                    sl.RMSNormalization.Config(),
+                    sl.Dense.Config(features=dim * 4, activation=jax.nn.gelu),
+                    sl.Dense.Config(features=dim),
+                ]),
+            ]),
+        ),
+        sl.RMSNormalization.Config(),
+        sl.Dense.Config(features=vocab_size),
+    ])
+
+  def test_layer(self):
+    config = self._config()
+    _compare_integration_int(self, config, vocab_size=64)
+
+  def test_step(self):
+    config = self._config()
+    jax_params = _compare_integration_int(self, config, vocab_size=64)
+    _compare_integration_int_step(
+        self, config, jax_params, vocab_size=64, num_steps=6
+    )
+
+
+class GQADecoderIntegrationTest(parameterized.TestCase):
+  """Cross-backend: decoder transformer with GQA."""
+
+  def _config(self, dim=32, num_heads=4, num_kv_heads=2, vocab_size=64):
+    return sl.Serial.Config([
+        sl.Embedding.Config(num_embeddings=vocab_size, dimension=dim),
+        sl.Residual.Config([
+            sl.RMSNormalization.Config(),
+            sl.DotProductSelfAttention.Config(
+                num_heads=num_heads,
+                units_per_head=dim // num_heads,
+                max_past_horizon=64,
+                max_future_horizon=0,
+                num_kv_heads=num_kv_heads,
+                input_projection=(
+                    attn_common.SeparateQueryKeyValueProjection()
+                ),
+                query_network=(
+                    sl.ApplyRotaryPositionalEncoding.Config(
+                        max_wavelength=10_000.0,
+                    )
+                ),
+                key_network=(
+                    sl.ApplyRotaryPositionalEncoding.Config(
+                        max_wavelength=10_000.0,
+                    )
+                ),
+            ),
+            sl.Flatten.Config(),
+        ]),
+        sl.Residual.Config([
+            sl.RMSNormalization.Config(),
+            sl.Dense.Config(features=dim * 4, activation=jax.nn.gelu),
+            sl.Dense.Config(features=dim),
+        ]),
+        sl.Dense.Config(features=vocab_size),
+    ])
+
+  def test_layer(self):
+    config = self._config()
+    _compare_integration_int(self, config, vocab_size=64)
+
+  def test_step(self):
+    config = self._config()
+    jax_params = _compare_integration_int(self, config, vocab_size=64)
+    _compare_integration_int_step(
+        self, config, jax_params, vocab_size=64, num_steps=6
+    )
+
+
+class ConvEncoderIntegrationTest(parameterized.TestCase):
+  """Cross-backend: conv + dense encoder (float input)."""
+
+  def _config(self, dim=16):
+    return sl.Serial.Config([
+        sl.Conv1D.Config(filters=dim, kernel_size=3, padding='causal'),
+        sl.Relu.Config(),
+        sl.Conv1D.Config(filters=dim, kernel_size=3, padding='causal'),
+        sl.Relu.Config(),
+        sl.LayerNormalization.Config(),
+        sl.Dense.Config(features=dim * 2, activation=jax.nn.gelu),
+        sl.Dense.Config(features=dim),
+    ])
+
+  def test_layer(self):
+    config = self._config()
+    _compare_integration_float(self, config, (8,))
+
+  def test_step(self):
+    config = self._config()
+    jax_params, _, _ = _compare_integration_float(self, config, (8,))
+    _compare_integration_step(self, config, (8,), jax_params)
+
+
+class ConvAttentionIntegrationTest(parameterized.TestCase):
+  """Cross-backend: conv + self-attention + pooling model (float input)."""
+
+  def _config(self, dim=16):
+    return sl.Serial.Config([
+        sl.Conv1D.Config(filters=dim, kernel_size=3, padding='causal'),
+        sl.Swish.Config(),
+        sl.Residual.Config([
+            sl.RMSNormalization.Config(),
+            sl.DotProductSelfAttention.Config(
+                num_heads=2,
+                units_per_head=dim // 2,
+                max_past_horizon=32,
+                max_future_horizon=0,
+            ),
+            sl.Flatten.Config(),
+        ]),
+        sl.MaxPooling1D.Config(pool_size=2, padding='causal'),
+        sl.Dense.Config(features=dim),
+    ])
+
+  def test_layer(self):
+    config = self._config()
+    _compare_integration_float(self, config, (8,), time=8)
+
+  def test_step(self):
+    config = self._config()
+    jax_params, _, _ = _compare_integration_float(self, config, (8,), time=8)
+    _compare_integration_step(
+        self, config, (8,), jax_params, num_steps=8, atol=5e-3, rtol=5e-3
+    )
+
+
+class EncoderDecoderIntegrationTest(parameterized.TestCase):
+  """Cross-backend: encoder-decoder with cross-attention (float input)."""
+
+  def _encoder_config(self, dim=16):
+    return sl.Serial.Config([
+        sl.Dense.Config(features=dim, activation=jax.nn.relu),
+        sl.Dense.Config(features=dim),
+    ])
+
+  def _decoder_config(self, dim=16):
+    from sequence_layers.jax.attention import \
+        dot_product_attention as jax_cross_attn
+
+    return sl.Serial.Config([
+        sl.Residual.Config([
+            sl.RMSNormalization.Config(),
+            sl.DotProductSelfAttention.Config(
+                num_heads=2,
+                units_per_head=dim // 2,
+                max_past_horizon=32,
+                max_future_horizon=0,
+            ),
+            sl.Flatten.Config(),
+        ]),
+        sl.Residual.Config([
+            sl.RMSNormalization.Config(),
+            jax_cross_attn.DotProductAttention.Config(
+                source_name='encoder',
+                num_heads=2,
+                units_per_head=dim // 2,
+            ),
+            sl.Flatten.Config(),
+        ]),
+        sl.Residual.Config([
+            sl.RMSNormalization.Config(),
+            sl.Dense.Config(features=dim * 2, activation=jax.nn.gelu),
+            sl.Dense.Config(features=dim),
+        ]),
+    ])
+
+  def _make_constants(self, batch_size, time, rng, dim=16):
+    source_values = rng.randn(batch_size, time, dim).astype(np.float32)
+    source_mask = np.ones((batch_size, time), dtype=bool)
+    jax_source = jax_types.Sequence(
+        jnp.array(source_values), jnp.array(source_mask, dtype=jnp.bool_)
+    )
+    mlx_source = Sequence(
+        mx.array(source_values), mx.array(source_mask, dtype=mx.bool_)
+    )
+    return {'encoder': jax_source}, {'encoder': mlx_source}
+
+  def test_layer(self):
+    config = self._decoder_config()
+    _compare_integration_float(
+        self,
+        config,
+        (16,),
+        constants_fn=lambda b, t, rng: self._make_constants(b, t, rng),
+    )
+
+  def test_step(self):
+    config = self._decoder_config()
+    jax_params, jax_constants, mlx_constants = _compare_integration_float(
+        self,
+        config,
+        (16,),
+        constants_fn=lambda b, t, rng: self._make_constants(b, t, rng),
+    )
+    _compare_integration_step(
+        self,
+        config,
+        (16,),
+        jax_params,
+        jax_constants=jax_constants,
+        mlx_constants=mlx_constants,
+    )
+
+
+class DepthwiseConvPipelineIntegrationTest(parameterized.TestCase):
+  """Cross-backend: depthwise conv + dense + normalization pipeline."""
+
+  def _config(self, dim=16):
+    return sl.Serial.Config([
+        sl.Dense.Config(features=dim),
+        sl.DepthwiseConv1D.Config(kernel_size=3, padding='causal'),
+        sl.Swish.Config(),
+        sl.LayerNormalization.Config(),
+        sl.Dense.Config(features=dim * 2, activation=jax.nn.gelu),
+        sl.Dense.Config(features=dim),
+        sl.DepthwiseConv1D.Config(kernel_size=5, padding='causal'),
+        sl.RMSNormalization.Config(),
+        sl.Dense.Config(features=dim),
+    ])
+
+  def test_layer(self):
+    config = self._config()
+    _compare_integration_float(self, config, (8,), atol=2e-3, rtol=2e-3)
+
+  def test_step(self):
+    config = self._config()
+    jax_params, _, _ = _compare_integration_float(
+        self, config, (8,), atol=2e-3, rtol=2e-3
+    )
+    _compare_integration_step(
+        self, config, (8,), jax_params, atol=2e-3, rtol=2e-3
+    )
+
+
+class ParallelBranchIntegrationTest(parameterized.TestCase):
+  """Cross-backend: parallel branches with different processing."""
+
+  def _config(self, dim=8):
+    return sl.Serial.Config([
+        sl.Parallel.Config(
+            layers=[
+                sl.Serial.Config([
+                    sl.Dense.Config(features=dim, activation=jax.nn.relu),
+                    sl.Dense.Config(features=dim),
+                ]),
+                sl.Serial.Config([
+                    sl.Dense.Config(features=dim, activation=jax.nn.gelu),
+                    sl.Dense.Config(features=dim),
+                ]),
+            ],
+            combination=sl.CombinationMode.ADD,
+        ),
+        sl.RMSNormalization.Config(),
+        sl.Dense.Config(features=dim),
+    ])
+
+  def test_layer(self):
+    config = self._config()
+    _compare_integration_float(self, config, (8,))
+
+  def test_step(self):
+    config = self._config()
+    jax_params, _, _ = _compare_integration_float(self, config, (8,))
+    _compare_integration_step(self, config, (8,), jax_params)
+
+
+def _compare_conditioning(
+    test_case,
+    config,
+    input_shape,
+    cond_shape,
+    *,
+    batch_size=2,
+    time=8,
+    atol=1e-5,
+    rtol=1e-5,
+    seed=42,
+):
+  """Compare Conditioning layer: JAX vs MLX."""
+  rng = np.random.RandomState(seed)
+  values = rng.randn(batch_size, time, *input_shape).astype(np.float32)
+  mask = np.ones((batch_size, time), dtype=bool)
+  cond_values = rng.randn(batch_size, time, *cond_shape).astype(np.float32)
+  cond_mask = np.ones((batch_size, time), dtype=bool)
+
+  jax_constants = {
+      'cond': jax_types.Sequence(
+          jnp.array(cond_values), jnp.array(cond_mask, dtype=jnp.bool_)
+      )
+  }
+  mlx_constants = {
+      'cond': Sequence(
+          mx.array(cond_values), mx.array(cond_mask, dtype=mx.bool_)
+      )
+  }
+
+  # JAX.
+  jax_model = config.make()
+  x_jax = jax_types.Sequence(
+      jnp.array(values), jnp.array(mask, dtype=jnp.bool_)
+  )
+  variables = jax_model.init(
+      jax.random.PRNGKey(0),
+      x_jax,
+      training=False,
+      constants=jax_constants,
+  )
+  jax_params = variables.get('params', {})
+  jax_out = np.array(
+      jax_model.apply(
+          variables,
+          x_jax,
+          training=False,
+          constants=jax_constants,
+      ).values
+  )
+
+  # MLX.
+  mlx_model = _make_mlx_model(config)
+  if jax_params:
+    weight_converter.load_linen_params(
+        mlx_model,
+        jax_params,
+        config,
+        input_spec=ShapeDType(input_shape, mx.float32),
+        constants=mlx_constants,
+    )
+  x_mx = Sequence(mx.array(values), mx.array(mask, dtype=mx.bool_))
+  mlx_out = np.array(
+      mlx_model.layer(x_mx, training=False, constants=mlx_constants).values
+  )
+
+  np.testing.assert_allclose(
+      mlx_out,
+      jax_out,
+      atol=atol,
+      rtol=rtol,
+      err_msg=f'{config.__class__.__qualname__}: outputs differ',
+  )
+
+
+class ConditioningCrossBackendTest(parameterized.TestCase):
+  """Conditioning: JAX vs MLX layer-mode."""
+
+  def test_identity_add(self):
+    from sequence_layers.jax import conditioning as jax_cond
+
+    config = jax_cond.Conditioning.Config(
+        conditioning_name='cond',
+        projection=jax_cond.BaseConditioning.Projection.IDENTITY,
+        combination=jax_cond.BaseConditioning.Combination.ADD,
+    )
+    _compare_conditioning(self, config, (8,), (8,))
+
+  def test_identity_mul(self):
+    from sequence_layers.jax import conditioning as jax_cond
+
+    config = jax_cond.Conditioning.Config(
+        conditioning_name='cond',
+        projection=jax_cond.BaseConditioning.Projection.IDENTITY,
+        combination=jax_cond.BaseConditioning.Combination.MUL,
+    )
+    _compare_conditioning(self, config, (8,), (8,))
+
+  def test_identity_concat(self):
+    from sequence_layers.jax import conditioning as jax_cond
+
+    config = jax_cond.Conditioning.Config(
+        conditioning_name='cond',
+        projection=jax_cond.BaseConditioning.Projection.IDENTITY,
+        combination=jax_cond.BaseConditioning.Combination.CONCAT,
+    )
+    _compare_conditioning(self, config, (4,), (6,))
+
+  def test_linear_add(self):
+    from sequence_layers.jax import conditioning as jax_cond
+
+    config = jax_cond.Conditioning.Config(
+        conditioning_name='cond',
+        projection=jax_cond.BaseConditioning.Projection.LINEAR,
+        combination=jax_cond.BaseConditioning.Combination.ADD,
+    )
+    _compare_conditioning(self, config, (4,), (6,))
+
+  def test_linear_affine_shift(self):
+    from sequence_layers.jax import conditioning as jax_cond
+
+    config = jax_cond.Conditioning.Config(
+        conditioning_name='cond',
+        projection=jax_cond.BaseConditioning.Projection.LINEAR,
+        combination=jax_cond.BaseConditioning.Combination.AFFINE_SHIFT,
+    )
+    _compare_conditioning(self, config, (4,), (6,))
+
+  def test_linear_affine_scale(self):
+    from sequence_layers.jax import conditioning as jax_cond
+
+    config = jax_cond.Conditioning.Config(
+        conditioning_name='cond',
+        projection=jax_cond.BaseConditioning.Projection.LINEAR,
+        combination=jax_cond.BaseConditioning.Combination.AFFINE_SCALE,
+    )
+    _compare_conditioning(self, config, (4,), (6,))
+
+  def test_linear_affine(self):
+    from sequence_layers.jax import conditioning as jax_cond
+
+    config = jax_cond.Conditioning.Config(
+        conditioning_name='cond',
+        projection=jax_cond.BaseConditioning.Projection.LINEAR_AFFINE,
+        combination=jax_cond.BaseConditioning.Combination.AFFINE,
+    )
+    _compare_conditioning(self, config, (4,), (6,))
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/jax/attention/common.py b/sequence_layers/jax/attention/common.py
index 6c78900..782359c 100644
--- a/sequence_layers/jax/attention/common.py
+++ b/sequence_layers/jax/attention/common.py
@@ -18,6 +18,8 @@
 import functools
 from typing import Any, Callable, Mapping, Protocol
 
+from sequence_layers.specs import attention as attention_spec
+
 from flax import linen as nn
 from flax import struct
 import jax
@@ -32,20 +34,6 @@
 from sequence_layers.jax import typing as jt
 from sequence_layers.jax import utils
 
-# These are the ones which also get exposed in __init__.py. Import other members
-# via sequence_layers.jax.attention.common.
-__all__ = [
-    # go/keep-sorted start
-    'CombinedQueryKeyValueProjection',
-    'CrossAttentionEmits',
-    'InputProjectionModule',
-    'QueryAndKeyValueProjection',
-    'QueryAndSharedKeyValueProjection',
-    'RelativePositionEmbedding',
-    'SelfAttentionEmits',
-    'SeparateQueryKeyValueProjection',
-    # go/keep-sorted end
-]
 
 
 # A negative enough value such that it underflows to a hard zero in softmax.
@@ -172,8 +160,7 @@ def get_source(
 
 
 @dataclasses.dataclass(frozen=True)
-class QueryKeyValueProjectionConfig:
-  """Configuration for QueryKeyValueProjection."""
+class QueryKeyValueProjectionConfig(attention_spec.QueryKeyValueProjectionConfig):
 
   # Optional callable that returns a jnp.einsum-compatible function to use
   # instead of jnp.einsum for the query, key and value projections.
@@ -305,7 +292,10 @@ def get_kv(self, x: types.Sequence) -> tuple[types.Sequence, types.Sequence]:
 
 
 @dataclasses.dataclass(frozen=True)
-class CombinedQueryKeyValueProjection(QueryKeyValueProjectionConfig):
+class CombinedQueryKeyValueProjection(
+    attention_spec.CombinedQueryKeyValueProjection,
+    QueryKeyValueProjectionConfig,
+):
   """Use a single projection matrix for query/key/value projection.
 
   * Incompatible with Grouped Query Attention (num_query_heads != num_kv_heads).
@@ -447,7 +437,10 @@ def get_kv(self, x: types.Sequence) -> tuple[types.Sequence, types.Sequence]:
 
 
 @dataclasses.dataclass(frozen=True)
-class SeparateQueryKeyValueProjection(QueryKeyValueProjectionConfig):
+class SeparateQueryKeyValueProjection(
+    attention_spec.SeparateQueryKeyValueProjection,
+    QueryKeyValueProjectionConfig,
+):
   """Use separate projection matrices for query/key/value projection.
 
   * Supports Grouped Query Attention (num_query_heads != num_kv_heads).
@@ -578,7 +571,10 @@ def get_kv(self, x: types.Sequence) -> tuple[types.Sequence, types.Sequence]:
 
 
 @dataclasses.dataclass(frozen=True)
-class QueryAndKeyValueProjection(QueryKeyValueProjectionConfig):
+class QueryAndKeyValueProjection(
+    attention_spec.QueryAndKeyValueProjection,
+    QueryKeyValueProjectionConfig,
+):
   """Use separate query and key/value projection matrices.
 
   * Supports Grouped Query Attention (num_query_heads != num_kv_heads).
@@ -710,7 +706,10 @@ def get_kv(self, x: types.Sequence) -> tuple[types.Sequence, types.Sequence]:
 
 
 @dataclasses.dataclass(frozen=True)
-class QueryAndSharedKeyValueProjection(QueryKeyValueProjectionConfig):
+class QueryAndSharedKeyValueProjection(
+    attention_spec.QueryAndSharedKeyValueProjection,
+    QueryKeyValueProjectionConfig,
+):
   """Use separate query and shared key/value projection matrices.
 
   * Supports Grouped Query Attention (num_query_heads != num_kv_heads).
@@ -766,6 +765,299 @@ def make(
     )
 
 
+class AttentionInputProjectionHelper:
+  """Helper class for shared attention input projection logic."""
+
+  def _setup_projection_layers(
+      self,
+      config: QueryKeyValueProjectionConfig,
+      num_query_heads: int,
+      num_kv_heads: int,
+      units_per_head: int,
+      use_bias: bool,
+      precision: jax.lax.PrecisionLike,
+      compute_dtype: types.DType,
+      param_dtype: types.DType,
+      allow_combined_qkv: bool = True,
+  ) -> None:
+    """Creates submodules, must be called from nn.Module.setup in subclasses."""
+    match config:
+      case CombinedQueryKeyValueProjection():
+        if not allow_combined_qkv:
+          raise ValueError(
+              'CombinedQueryKeyValueProjection is not supported. Use'
+              ' SeparateQueryKeyValueProjection or'
+              ' QueryAndSharedKeyValueProjection.'
+          )
+        if num_query_heads != num_kv_heads:
+          raise ValueError(
+              f'num_query_heads={num_query_heads} !='
+              f' num_kv_heads={num_kv_heads}'
+          )
+        num_stacked = 2 if config.share_kv_projection else 3
+        self._qkv = utils.FlaxEinsumDense(
+            equation='...a,abcd->...bcd',
+            output_shape=(num_stacked, num_query_heads, units_per_head),
+            bias_axes='bcd' if use_bias else None,
+            kernel_init=utils.shard_initializer(
+                config.qkv_kernel_init,
+                config.qkv_kernel_sharding,
+                projectable=True,
+                axes_types=(
+                    meta.AxisType.FANIN,
+                    meta.AxisType.STACKED,
+                    None,
+                    None,
+                ),
+            ),
+            bias_init=utils.shard_initializer(
+                config.bias_init, config.bias_sharding
+            ),
+            precision=precision,
+            compute_dtype=compute_dtype,
+            param_dtype=param_dtype,
+            einsum_factory=config.einsum_factory,
+            quantization_provider=config.quantization_provider,
+            name='query_key_value_projection',
+        )
+      case SeparateQueryKeyValueProjection():
+        self._q = utils.FlaxEinsumDense(
+            equation='...a,abc->...bc',
+            output_shape=(num_query_heads, units_per_head),
+            bias_axes='bc' if use_bias else None,
+            kernel_init=utils.shard_initializer(
+                config.q_kernel_init,
+                config.q_kernel_sharding,
+                projectable=True,
+                axes_types=(meta.AxisType.FANIN, None, None),
+            ),
+            bias_init=utils.shard_initializer(
+                config.bias_init, config.bias_sharding
+            ),
+            precision=precision,
+            compute_dtype=compute_dtype,
+            param_dtype=param_dtype,
+            einsum_factory=config.einsum_factory,
+            quantization_provider=config.quantization_provider,
+            name='query_projection',
+        )
+        self._k = utils.FlaxEinsumDense(
+            equation='...a,abc->...bc',
+            output_shape=(num_kv_heads, units_per_head),
+            bias_axes='bc' if use_bias else None,
+            kernel_init=utils.shard_initializer(
+                config.k_kernel_init,
+                config.k_kernel_sharding,
+                projectable=True,
+                axes_types=(meta.AxisType.FANIN, None, None),
+            ),
+            bias_init=utils.shard_initializer(
+                config.bias_init, config.bias_sharding
+            ),
+            precision=precision,
+            compute_dtype=compute_dtype,
+            param_dtype=param_dtype,
+            einsum_factory=config.einsum_factory,
+            quantization_provider=config.quantization_provider,
+            name='key_projection',
+        )
+        self._v = utils.FlaxEinsumDense(
+            equation='...a,abc->...bc',
+            output_shape=(num_kv_heads, units_per_head),
+            bias_axes='bc' if use_bias else None,
+            kernel_init=utils.shard_initializer(
+                config.v_kernel_init,
+                config.v_kernel_sharding,
+                projectable=True,
+                axes_types=(meta.AxisType.FANIN, None, None),
+            ),
+            bias_init=utils.shard_initializer(
+                config.bias_init, config.bias_sharding
+            ),
+            precision=precision,
+            compute_dtype=compute_dtype,
+            param_dtype=param_dtype,
+            einsum_factory=config.einsum_factory,
+            quantization_provider=config.quantization_provider,
+            name='value_projection',
+        )
+      case QueryAndKeyValueProjection():
+        self._q = utils.FlaxEinsumDense(
+            equation='...a,abc->...bc',
+            output_shape=(num_query_heads, units_per_head),
+            bias_axes='bc' if use_bias else None,
+            kernel_init=utils.shard_initializer(
+                config.q_kernel_init,
+                config.q_kernel_sharding,
+                projectable=True,
+                axes_types=(meta.AxisType.FANIN, None, None),
+            ),
+            bias_init=utils.shard_initializer(
+                config.q_bias_init, config.q_bias_sharding
+            ),
+            precision=precision,
+            compute_dtype=compute_dtype,
+            param_dtype=param_dtype,
+            einsum_factory=config.einsum_factory,
+            quantization_provider=config.quantization_provider,
+            name='query_projection',
+        )
+        self._kv = utils.FlaxEinsumDense(
+            equation='...a,abcd->...bcd',
+            output_shape=(2, num_kv_heads, units_per_head),
+            bias_axes='bcd' if use_bias else None,
+            kernel_init=utils.shard_initializer(
+                config.kv_kernel_init,
+                config.kv_kernel_sharding,
+                projectable=True,
+                axes_types=(
+                    meta.AxisType.FANIN,
+                    meta.AxisType.STACKED,
+                    None,
+                    None,
+                ),
+            ),
+            bias_init=utils.shard_initializer(
+                config.kv_bias_init, config.kv_bias_sharding
+            ),
+            precision=precision,
+            compute_dtype=compute_dtype,
+            param_dtype=param_dtype,
+            einsum_factory=config.einsum_factory,
+            quantization_provider=config.quantization_provider,
+            name='key_value_projection',
+        )
+      case QueryAndSharedKeyValueProjection():
+        self._q = utils.FlaxEinsumDense(
+            equation='...a,abc->...bc',
+            output_shape=(num_query_heads, units_per_head),
+            bias_axes='bc' if use_bias else None,
+            kernel_init=utils.shard_initializer(
+                config.q_kernel_init,
+                config.q_kernel_sharding,
+                projectable=True,
+                axes_types=(meta.AxisType.FANIN, None, None),
+            ),
+            bias_init=utils.shard_initializer(
+                config.q_bias_init, config.q_bias_sharding
+            ),
+            precision=precision,
+            compute_dtype=compute_dtype,
+            param_dtype=param_dtype,
+            einsum_factory=config.einsum_factory,
+            quantization_provider=config.quantization_provider,
+            name='query_projection',
+        )
+        self._shared_kv = utils.FlaxEinsumDense(
+            equation='...a,abc->...bc',
+            output_shape=(num_kv_heads, units_per_head),
+            bias_axes='bc' if use_bias else None,
+            kernel_init=utils.shard_initializer(
+                config.kv_kernel_init,
+                config.kv_kernel_sharding,
+                projectable=True,
+                axes_types=(
+                    meta.AxisType.FANIN,
+                    None,
+                    None,
+                ),
+            ),
+            bias_init=utils.shard_initializer(
+                config.kv_bias_init, config.kv_bias_sharding
+            ),
+            precision=precision,
+            compute_dtype=compute_dtype,
+            param_dtype=param_dtype,
+            einsum_factory=config.einsum_factory,
+            quantization_provider=config.quantization_provider,
+            name='shared_key_value_projection',
+        )
+
+  def get_input_projection_output_dtype(
+      self,
+      config: QueryKeyValueProjectionConfig,
+      input_dtype: types.DType,
+      constants: types.Constants | None = None,
+  ) -> types.DType:
+    """Returns the output dtype of the QKV projection."""
+    match config:
+      case CombinedQueryKeyValueProjection():
+        return self._qkv.get_output_dtype(input_dtype, constants=constants)
+      case (
+          SeparateQueryKeyValueProjection()
+          | QueryAndKeyValueProjection()
+          | QueryAndSharedKeyValueProjection()
+      ):
+        return self._q.get_output_dtype(input_dtype, constants=constants)
+      case _:
+        raise NotImplementedError(config)
+
+  def get_qkv(
+      self, config: QueryKeyValueProjectionConfig, x: types.Sequence
+  ) -> tuple[types.Sequence, types.Sequence, types.Sequence]:
+    """Project input to query/key/value sequences."""
+    match config:
+      case CombinedQueryKeyValueProjection():
+        projection = utils.sequence_unstack(
+            self._qkv.project_sequence(x), axis=2
+        )
+
+        if len(projection) == 2:
+          # Shared K and V.
+          queries, keys = projection
+          values = keys
+        else:
+          queries, keys, values = projection
+      case SeparateQueryKeyValueProjection():
+        queries = self._q.project_sequence(x)
+        keys = self._k.project_sequence(x)
+        values = self._v.project_sequence(x)
+      case QueryAndKeyValueProjection():
+        queries = self._q.project_sequence(x)
+        keys, values = utils.sequence_unstack(
+            self._kv.project_sequence(x), axis=2
+        )
+      case QueryAndSharedKeyValueProjection():
+        queries = self._q.project_sequence(x)
+        keys = values = self._shared_kv.project_sequence(x)
+      case _:
+        raise NotImplementedError(config)
+    return queries, keys, values
+
+  def get_q(
+      self, config: QueryKeyValueProjectionConfig, x: types.Sequence
+  ) -> types.Sequence:
+    """Project input to query sequence."""
+    match config:
+      case SeparateQueryKeyValueProjection():
+        queries = self._q.project_sequence(x)
+      case QueryAndKeyValueProjection():
+        queries = self._q.project_sequence(x)
+      case QueryAndSharedKeyValueProjection():
+        queries = self._q.project_sequence(x)
+      case _:
+        raise NotImplementedError(config)
+    return queries
+
+  def get_kv(
+      self, config: QueryKeyValueProjectionConfig, x: types.Sequence
+  ) -> tuple[types.Sequence, types.Sequence]:
+    """Project input to key/value sequences."""
+    match config:
+      case SeparateQueryKeyValueProjection():
+        keys = self._k.project_sequence(x)
+        values = self._v.project_sequence(x)
+      case QueryAndKeyValueProjection():
+        keys, values = utils.sequence_unstack(
+            self._kv.project_sequence(x), axis=2
+        )
+      case QueryAndSharedKeyValueProjection():
+        keys = values = self._shared_kv.project_sequence(x)
+      case _:
+        raise NotImplementedError(config)
+    return keys, values
+
+
 class SelfAttentionEmits(struct.PyTreeNode):
   """A structure for emits produced by self attention layers."""
 
diff --git a/sequence_layers/jax/attention/dot_product_attention.py b/sequence_layers/jax/attention/dot_product_attention.py
index 16f0408..6ff3663 100644
--- a/sequence_layers/jax/attention/dot_product_attention.py
+++ b/sequence_layers/jax/attention/dot_product_attention.py
@@ -14,6 +14,7 @@
 """Dot product attention."""
 
 import dataclasses
+from collections.abc import Sequence as TypingSequence
 from flax import linen as nn
 import jax.numpy as jnp
 import jaxtyping
@@ -22,13 +23,21 @@
 from sequence_layers.jax import typing as jt
 from sequence_layers.jax import utils
 from sequence_layers.jax.attention import common
+from sequence_layers.specs import attention as attention_spec
 
 
-class DotProductAttention(types.Emitting):
+class DotProductAttention(
+    types.Emitting,
+    common.AttentionInputProjectionHelper,
+    attention_spec.DotProductAttention[types.Sequence, types.ChannelSpec],
+):
   """Dot product attention."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(
+      types.SequenceLayerConfig,
+      attention_spec.DotProductAttention.Config,
+  ):
     """Configuration for DotProductAttention."""
 
     # The key to lookup source sequence from constants dictionary.
@@ -122,6 +131,8 @@ class Config(types.SequenceLayerConfig):
     # the encoder is bidirectional and therefore needs to be re-computed after
     # each block.
     recompute_kv_per_step: bool = False
+    # Whether to emit attention weights.
+    emit_attention_weights: bool = False
     # An optional name for the layer.
     name: str | None = None
 
diff --git a/sequence_layers/jax/attention/dot_product_attention_test.py b/sequence_layers/jax/attention/dot_product_attention_test.py
index b070902..f568674 100644
--- a/sequence_layers/jax/attention/dot_product_attention_test.py
+++ b/sequence_layers/jax/attention/dot_product_attention_test.py
@@ -26,6 +26,7 @@
 from sequence_layers.jax.attention import shaw_relative_position_embedding
 from sequence_layers.jax.attention import t5_relative_position_embedding
 from sequence_layers.jax.attention import test_utils as attention_test_utils
+from sequence_layers.specs import attention_behaviors as attention_spec_behaviors
 
 
 # Custom init function so that position bias decreases as absolute
@@ -50,7 +51,10 @@ def _t5_position_bias_mat_init(
   return bias_matrix
 
 
-class DotProductAttentionTest(test_utils.SequenceLayerTest):
+class DotProductAttentionTest(
+    test_utils.SequenceLayerTest,
+    attention_spec_behaviors.DotProductAttentionTest,
+):
 
   @parameterized.parameters(
       (1, 2, 0, False),
diff --git a/sequence_layers/jax/attention/dot_product_self_attention.py b/sequence_layers/jax/attention/dot_product_self_attention.py
index 7e25cec..a1f9a49 100644
--- a/sequence_layers/jax/attention/dot_product_self_attention.py
+++ b/sequence_layers/jax/attention/dot_product_self_attention.py
@@ -14,6 +14,7 @@
 """Dot product self attention layer."""
 
 import dataclasses
+from collections.abc import Sequence as TypingSequence
 from flax import linen as nn
 import jax
 import jax.numpy as jnp
@@ -22,13 +23,21 @@
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
 from sequence_layers.jax.attention import common
+from sequence_layers.specs import attention as attention_spec
 
 
-class DotProductSelfAttention(types.Emitting):
+class DotProductSelfAttention(
+    types.Emitting,
+    common.AttentionInputProjectionHelper,
+    attention_spec.DotProductSelfAttention[types.Sequence, types.ChannelSpec],
+):
   """A multi-headed dot-product self attention layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(
+      types.SequenceLayerConfig,
+      attention_spec.DotProductSelfAttention.Config,
+  ):
     """Configuration for DotProductSelfAttention."""
 
     # The number of attention heads. If num_kv_heads is set, num_heads must be
@@ -136,6 +145,8 @@ class Config(types.SequenceLayerConfig):
     # accumulate the logits in float32 instead of simply upcasting the output of
     # the logits einsum to float32.
     experimental_accumulate_logits_in_float32: bool = False
+    # Whether to emit attention weights.
+    emit_attention_weights: bool = False
     # An optional name for the layer.
     name: str | None = None
 
diff --git a/sequence_layers/jax/attention/dot_product_self_attention_test.py b/sequence_layers/jax/attention/dot_product_self_attention_test.py
index a3a2072..2fe5c7b 100644
--- a/sequence_layers/jax/attention/dot_product_self_attention_test.py
+++ b/sequence_layers/jax/attention/dot_product_self_attention_test.py
@@ -25,9 +25,13 @@
 from sequence_layers.jax.attention import shaw_relative_position_embedding
 from sequence_layers.jax.attention import t5_relative_position_embedding
 from sequence_layers.jax.attention import test_utils as attention_test_utils
+from sequence_layers.specs import attention_behaviors as attention_spec_behaviors
 
 
-class DotProductSelfAttentionTest(test_utils.SequenceLayerTest):
+class DotProductSelfAttentionTest(
+    test_utils.SequenceLayerTest,
+    attention_spec_behaviors.DotProductSelfAttentionTest,
+):
 
   @parameterized.parameters(
       # max_past_horizon > 0, max_future_horizon == 0. Steppable.
@@ -820,77 +824,6 @@ def custom_einsum(equation, *args, **kwargs):
     y_default = l_default.layer(x, training=False).mask_invalid()
     self.assertSequencesNotClose(y_einsum, y_default)
 
-  @parameterized.parameters(
-      # max_past_horizon > 0, max_future_horizon == 0. Steppable.
-      (1, 2, 3, 0, False),
-      (1, 2, 3, 0, True),
-      (3, 5, 3, 0, False),
-      (3, 5, 3, 0, True),
-      # max_past_horizon > 0, max_future_horizon > 0. Steppable.
-      (3, 5, 3, 2, False),
-      (3, 5, 3, 2, True),
-      (3, 5, 3, 5, False),
-      (3, 5, 3, 5, True),
-  )
-  def test_use_kv_cache_ringbuffer(
-      self,
-      num_heads,
-      units_per_head,
-      max_past_horizon,
-      max_future_horizon,
-      random_mask,
-  ):
-    key = jax.random.PRNGKey(1234)
-    batch_size = 2
-    l = dot_product_self_attention.DotProductSelfAttention.Config(
-        num_heads=num_heads,
-        units_per_head=units_per_head,
-        max_past_horizon=max_past_horizon,
-        max_future_horizon=max_future_horizon,
-        precision=jax.lax.Precision.HIGHEST,
-        per_dim_scale=True,
-        use_kv_cache_ringbuffer=True,
-        name='dot_product_self_attention',
-    ).make()
-
-    channels = 1
-    x = test_utils.random_sequence(batch_size, 1, channels)
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'dot_product_self_attention')
-    self.assertEqual(
-        l.get_output_shape_for_sequence(x), (num_heads, units_per_head)
-    )
-    self.assertEqual(
-        l.supports_step, max_past_horizon >= 0 and max_future_horizon >= 0
-    )
-    self.assertEqual(l.input_latency, max(0, max_future_horizon))
-
-    attention_test_utils.assert_param_dtypes_inits_shapes(
-        l,
-        x,
-        num_sink_embeddings=0,
-        input_projection=l.config.input_projection,
-    )
-
-    # Sweep time dimension shorter and longer than max_horizon.
-    for time in [1, 2, 3, 11, 12]:
-      with self.subTest(f'time{time}'):
-        x = test_utils.random_sequence(
-            batch_size, time, channels, random_mask=random_mask
-        )
-        self.verify_contract(
-            l,
-            x,
-            training=False,
-            # Ring buffer does not support step size > 1.
-            test_2x_step=False,
-            grad_atol=1e-5,
-            grad_rtol=1e-5,
-        )
-
 
 if __name__ == '__main__':
   test_utils.main()
diff --git a/sequence_layers/jax/attention/local_dot_product_self_attention.py b/sequence_layers/jax/attention/local_dot_product_self_attention.py
index d837320..0f8fcbe 100644
--- a/sequence_layers/jax/attention/local_dot_product_self_attention.py
+++ b/sequence_layers/jax/attention/local_dot_product_self_attention.py
@@ -22,13 +22,21 @@
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
 from sequence_layers.jax.attention import common
+from sequence_layers.specs import attention as attention_spec
 
 
-class LocalDotProductSelfAttention(types.Emitting):
+class LocalDotProductSelfAttention(
+    types.Emitting,
+    common.AttentionInputProjectionHelper,
+    attention_spec.LocalDotProductSelfAttention[types.Sequence, types.ChannelSpec],
+):
   """A multi-headed dot-product self attention layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(
+      types.SequenceLayerConfig,
+      attention_spec.LocalDotProductSelfAttention.Config,
+  ):
     """Configuration for LocalDotProductSelfAttention."""
 
     # The number of attention heads.
diff --git a/sequence_layers/jax/attention/local_dot_product_self_attention_test.py b/sequence_layers/jax/attention/local_dot_product_self_attention_test.py
index 60dd0a8..7806087 100644
--- a/sequence_layers/jax/attention/local_dot_product_self_attention_test.py
+++ b/sequence_layers/jax/attention/local_dot_product_self_attention_test.py
@@ -14,14 +14,19 @@
 from absl.testing import parameterized
 import jax
 import jax.numpy as jnp
+
 from sequence_layers.jax import position
 from sequence_layers.jax import test_utils
 from sequence_layers.jax.attention import local_dot_product_self_attention
 from sequence_layers.jax.attention import test_utils as attention_test_utils
 from sequence_layers.jax.attention import transformer_xl_relative_position_embedding
+from sequence_layers.specs import attention_behaviors as attention_spec_behaviors
 
 
-class LocalDotProductSelfAttentionTest(test_utils.SequenceLayerTest):
+class LocalDotProductSelfAttentionTest(
+    test_utils.SequenceLayerTest,
+    attention_spec_behaviors.LocalDotProductSelfAttentionTest,
+):
 
   @parameterized.parameters(
       # max_past_horizon > 0, max_future_horizon == 0
@@ -307,63 +312,6 @@ def test_rotary_positional_encoding(
             grad_rtol=1e-5,
         )
 
-  def test_query_key_value_network_supports_step(
-      self,
-  ):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(2, 1, 3)
-    l = local_dot_product_self_attention.LocalDotProductSelfAttention.Config(
-        num_heads=3,
-        units_per_head=5,
-        max_past_horizon=3,
-        max_future_horizon=0,
-        block_size=1,
-        query_network=position.AddTimingSignal.Config(),
-        key_network=position.AddTimingSignal.Config(),
-        value_network=position.AddTimingSignal.Config(),
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertTrue(l.supports_step)
-
-    l = local_dot_product_self_attention.LocalDotProductSelfAttention.Config(
-        num_heads=3,
-        units_per_head=5,
-        max_past_horizon=3,
-        max_future_horizon=0,
-        block_size=1,
-        query_network=test_utils.NonSteppableLayer.Config(),
-        key_network=position.AddTimingSignal.Config(),
-        value_network=position.AddTimingSignal.Config(),
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertFalse(l.supports_step)
-
-    l = local_dot_product_self_attention.LocalDotProductSelfAttention.Config(
-        num_heads=3,
-        units_per_head=5,
-        max_past_horizon=3,
-        max_future_horizon=0,
-        block_size=1,
-        query_network=position.AddTimingSignal.Config(),
-        key_network=test_utils.NonSteppableLayer.Config(),
-        value_network=position.AddTimingSignal.Config(),
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertFalse(l.supports_step)
-
-    l = local_dot_product_self_attention.LocalDotProductSelfAttention.Config(
-        num_heads=3,
-        units_per_head=5,
-        max_past_horizon=3,
-        max_future_horizon=0,
-        block_size=1,
-        query_network=position.AddTimingSignal.Config(),
-        key_network=position.AddTimingSignal.Config(),
-        value_network=test_utils.NonSteppableLayer.Config(),
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertFalse(l.supports_step)
-
   @parameterized.product(
       test_utils.standard_dtype_configs(),
       config=(
diff --git a/sequence_layers/jax/attention/streaming_dot_product_attention.py b/sequence_layers/jax/attention/streaming_dot_product_attention.py
index bc16c50..453bae4 100644
--- a/sequence_layers/jax/attention/streaming_dot_product_attention.py
+++ b/sequence_layers/jax/attention/streaming_dot_product_attention.py
@@ -21,9 +21,14 @@
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
 from sequence_layers.jax.attention import common
+from sequence_layers.specs import attention as attention_spec
 
 
-class StreamingDotProductAttention(types.Emitting):
+class StreamingDotProductAttention(
+    types.Emitting,
+    common.AttentionInputProjectionHelper,
+    attention_spec.StreamingDotProductAttention[types.Sequence, types.ChannelSpec],
+):
   """A multi-headed streaming dot-product attention layer.
 
   Unlike most SequenceLayers, this cross-attention layer assumes that when using
@@ -34,7 +39,10 @@ class StreamingDotProductAttention(types.Emitting):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(
+      types.SequenceLayerConfig,
+      attention_spec.StreamingDotProductAttention.Config,
+  ):
     """Configuration for StreamingDotProductAttention."""
 
     # The key to lookup source sequence from constants dictionary.
@@ -137,6 +145,8 @@ class Config(types.SequenceLayerConfig):
     # accumulate the logits in float32 instead of simply upcasting the output of
     # the logits einsum to float32.
     experimental_accumulate_logits_in_float32: bool = False
+    # Whether to emit attention weights.
+    emit_attention_weights: bool = False
     # An optional name for the layer.
     name: str | None = None
 
diff --git a/sequence_layers/jax/attention/streaming_dot_product_attention_test.py b/sequence_layers/jax/attention/streaming_dot_product_attention_test.py
index 8ed1613..9d3068a 100644
--- a/sequence_layers/jax/attention/streaming_dot_product_attention_test.py
+++ b/sequence_layers/jax/attention/streaming_dot_product_attention_test.py
@@ -13,9 +13,11 @@
 # limitations under the License.
 
 from typing import Literal
+
 from absl.testing import parameterized
 import jax
 import jax.numpy as jnp
+
 from sequence_layers.jax import position
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
@@ -23,9 +25,13 @@
 from sequence_layers.jax.attention import common
 from sequence_layers.jax.attention import streaming_dot_product_attention
 from sequence_layers.jax.attention import test_utils as attention_test_utils
+from sequence_layers.specs import attention_behaviors as attention_spec_behaviors
 
 
-class StreamingDotProductAttentionTest(test_utils.SequenceLayerTest):
+class StreamingDotProductAttentionTest(
+    test_utils.SequenceLayerTest,
+    attention_spec_behaviors.StreamingDotProductAttentionTest,
+):
 
   @parameterized.parameters(
       # max_past_horizon > 0, max_future_horizon == 0
@@ -348,65 +354,6 @@ def test_no_query_delay_buffer(self, use_rope: bool):
         y_layer.mask_invalid(), y_step[:, max_future_horizon:].mask_invalid()
     )
 
-  def test_query_key_value_network_supports_step(self):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(2, 1, 3)
-    source = test_utils.random_sequence(2, 1, 5)
-    constants = {'source': source}
-    l = streaming_dot_product_attention.StreamingDotProductAttention.Config(
-        'source',
-        num_heads=3,
-        units_per_head=5,
-        max_past_horizon=3,
-        max_future_horizon=0,
-        query_network=position.AddTimingSignal.Config(),
-        key_network=position.AddTimingSignal.Config(),
-        value_network=position.AddTimingSignal.Config(),
-    ).make()
-    l = self.init_and_bind_layer(key, l, x, constants=constants)
-    self.assertTrue(l.supports_step)
-
-    l = streaming_dot_product_attention.StreamingDotProductAttention.Config(
-        'source',
-        num_heads=3,
-        units_per_head=5,
-        max_past_horizon=3,
-        max_future_horizon=0,
-        query_network=test_utils.NonSteppableLayer.Config(),
-        key_network=position.AddTimingSignal.Config(),
-        value_network=position.AddTimingSignal.Config(),
-    ).make()
-    l = self.init_and_bind_layer(key, l, x, constants=constants)
-    self.assertFalse(l.supports_step)
-
-    l = streaming_dot_product_attention.StreamingDotProductAttention.Config(
-        'source',
-        num_heads=3,
-        units_per_head=5,
-        max_past_horizon=3,
-        max_future_horizon=0,
-        query_network=position.AddTimingSignal.Config(),
-        key_network=test_utils.NonSteppableLayer.Config(),
-        value_network=position.AddTimingSignal.Config(),
-    ).make()
-    l = self.init_and_bind_layer(key, l, x, constants=constants)
-    # The key/value network must be steppable for streaming.
-    self.assertFalse(l.supports_step)
-
-    l = streaming_dot_product_attention.StreamingDotProductAttention.Config(
-        'source',
-        num_heads=3,
-        units_per_head=5,
-        max_past_horizon=3,
-        max_future_horizon=0,
-        query_network=position.AddTimingSignal.Config(),
-        key_network=position.AddTimingSignal.Config(),
-        value_network=test_utils.NonSteppableLayer.Config(),
-    ).make()
-    l = self.init_and_bind_layer(key, l, x, constants=constants)
-    # The key/value network must be steppable for streaming.
-    self.assertFalse(l.supports_step)
-
   @parameterized.product(
       (
           {
diff --git a/sequence_layers/jax/attention/streaming_local_dot_product_attention.py b/sequence_layers/jax/attention/streaming_local_dot_product_attention.py
index c05f999..5e30f72 100644
--- a/sequence_layers/jax/attention/streaming_local_dot_product_attention.py
+++ b/sequence_layers/jax/attention/streaming_local_dot_product_attention.py
@@ -21,9 +21,14 @@
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
 from sequence_layers.jax.attention import common
+from sequence_layers.specs import attention as attention_spec
 
 
-class StreamingLocalDotProductAttention(types.Emitting):
+class StreamingLocalDotProductAttention(
+    types.Emitting,
+    common.AttentionInputProjectionHelper,
+    attention_spec.StreamingDotProductAttention[types.Sequence, types.ChannelSpec],
+):
   """A multi-headed streaming local dot-product attention layer.
 
   Unlike most SequenceLayers, this cross-attention layer assumes that when using
@@ -37,7 +42,10 @@ class StreamingLocalDotProductAttention(types.Emitting):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(
+      types.SequenceLayerConfig,
+      attention_spec.StreamingDotProductAttention.Config,
+  ):
     """Configuration for StreamingLocalDotProductAttention."""
 
     # The key to lookup source sequence from constants dictionary.
diff --git a/sequence_layers/jax/combinators.py b/sequence_layers/jax/combinators.py
index 1f63545..b8d2235 100644
--- a/sequence_layers/jax/combinators.py
+++ b/sequence_layers/jax/combinators.py
@@ -18,7 +18,7 @@
 import fractions
 import functools
 import math
-from typing import Callable, Sequence as TypingSequence, TypeVar
+from typing import Callable, override, Sequence as TypingSequence, TypeVar
 
 import flax
 import flax.linen as nn
@@ -29,9 +29,10 @@
 from sequence_layers.jax import simple
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
+from sequence_layers.specs import combinators as spec
 
 
-CombinationMode = utils.CombinationMode
+CombinationMode = spec.CombinationMode
 
 __all__ = (
     # go/keep-sorted start
@@ -324,11 +325,15 @@ def layer_with_emits(
     return x, emits
 
 
-class Serial(SerialCombinatorMixin, types.Emitting):
+class Serial(
+    SerialCombinatorMixin,
+    types.Emitting,
+    spec.Serial[types.Sequence, types.ShapeDType],
+):
   """A combinator that processes SequenceLayers serially."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.Serial.Config):
     """Configuration for Serial."""
 
     layers: TypingSequence[types.SequenceLayerConfig]
@@ -356,7 +361,11 @@ def setup(self) -> None:
       utils.setup_shared_scope(self, self.layers, self.config.share_scope)
 
 
-class SerialModules(SerialCombinatorMixin, types.Emitting):
+class SerialModules(
+    SerialCombinatorMixin,
+    types.Emitting,
+    spec.SerialModules[types.Sequence, types.ShapeDType],
+):
   """A Serial combinator that processes pre-existing SequenceLayers serially.
 
   Passing pre-constructed modules into another nn.Module can have unintended
@@ -387,18 +396,18 @@ def get_sampler(self) -> sl.SequenceLayer:
   layers: tuple[types.SequenceLayer, ...]
 
 
-class Parallel(types.Emitting):
+class Parallel(types.Emitting, spec.Parallel[types.Sequence, types.ShapeDType]):
   """Applies a sequence of layers in parallel.
 
   Outputs are broadcasted and combined together.
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.Parallel.Config):
     """Config for Parallel."""
 
     layers: TypingSequence[types.SequenceLayerConfig]
-    combination: utils.CombinationMode = utils.CombinationMode.STACK
+    combination: CombinationMode = CombinationMode.STACK
     # If true, a list of boolean values for each layer in `layers` indicating
     # whether to share this Serial's Flax parameter scope with that layer. This
     # is useful to avoid representing the Serial layer in the parameter tree. If
@@ -801,7 +810,11 @@ def step_with_emits(
     return y, tuple(states), tuple(emits)
 
 
-class Residual(SerialCombinatorMixin, types.Emitting):
+class Residual(
+    SerialCombinatorMixin,
+    types.Emitting,
+    spec.Residual[types.Sequence, types.ShapeDType],
+):
   """A residual wrapper around l that computes `y = l(x) + shortcut(x)`.
 
   If shortcut is not provided, it defaults to an identity or a linear projection
@@ -811,7 +824,7 @@ class Residual(SerialCombinatorMixin, types.Emitting):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.Residual.Config):
     """Config for Residual."""
 
     layers: TypingSequence[types.SequenceLayerConfig]
@@ -1049,7 +1062,7 @@ def get_output_dtype(
     return jnp.result_type(layer_dtype, shortcut_dtype)
 
 
-class Repeat(types.Emitting):
+class Repeat(types.Emitting, spec.Repeat[types.Sequence, types.ShapeDType]):
   """A combinator that repeats the specified SequenceLayer N times.
 
   Execution is performed in a loop, enabling reduced compilation times since the
@@ -1067,7 +1080,7 @@ class Repeat(types.Emitting):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.Repeat.Config):
     """Configuration for Repeat."""
     layer: types.SequenceLayerConfig
     num_repeats: int
diff --git a/sequence_layers/jax/combinators_test.py b/sequence_layers/jax/combinators_test.py
index ed35a49..539bfb3 100644
--- a/sequence_layers/jax/combinators_test.py
+++ b/sequence_layers/jax/combinators_test.py
@@ -35,6 +35,11 @@
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
+from sequence_layers.specs import combinators_behaviors as spec
+
+
+class CombinatorBehaviorsTest(test_utils.SequenceLayerTest, spec.CombinatorBehaviorsTest):
+  """Shared behavior tests for combinators in JAX."""
 
 
 class SerialTest(test_utils.SequenceLayerTest):
diff --git a/sequence_layers/jax/conditioning.py b/sequence_layers/jax/conditioning.py
index c6e5f8e..31caead 100644
--- a/sequence_layers/jax/conditioning.py
+++ b/sequence_layers/jax/conditioning.py
@@ -16,14 +16,16 @@
 import abc
 import dataclasses
 import enum
+from typing import override
 
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
+
 from sequence_layers.jax import dense
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
-
+from sequence_layers.specs import conditioning as conditioning_spec
 
 __all__ = (
     # go/keep-sorted start
@@ -61,44 +63,12 @@ def _get_conditioning(
 
 
 class BaseConditioning(
-    types.PreservesType, types.SequenceLayer, metaclass=abc.ABCMeta
+    types.PreservesType,
+    conditioning_spec.BaseConditioning[types.Sequence, types.ChannelSpec],
+    metaclass=abc.ABCMeta,
 ):
   """Base class for conditioning types."""
 
-  @enum.unique
-  class Projection(enum.Enum):
-    """The type of projection to perform."""
-
-    # No projection.
-    IDENTITY = 1
-    # Dense projection from every element of c at a given time step, to a tensor
-    # of the same shape as x at given time step (c.channel_shape.num_elements()
-    # to x.channel_shape.num_elements()).
-    LINEAR = 2
-    # Dense projection from every element of c at a given time step, to a tensor
-    # of shape [2, x.shape...] at given time step (
-    # c.channel_shape.num_elements() to 2 * x.channel_shape.num_elements()).
-    LINEAR_AFFINE = 3
-
-  @enum.unique
-  class Combination(enum.Enum):
-    """The type of combination to perform."""
-
-    # Broadcast-add conditioning.
-    ADD = 1
-    # Broadcast-concat conditioning.
-    CONCAT = 2
-    # Affine conditioning. Requires LINEAR_AFFINE projection.
-    AFFINE = 3
-    # Affine shift conditioning. Requires LINEAR projection.
-    AFFINE_SHIFT = 4
-    # Affine scale conditioning. Requires LINEAR projection.
-    AFFINE_SCALE = 5
-    # Broadcast-multiply conditioning. Requires LINEAR or IDENTITY projection.
-    MUL = 6
-    # Broadcast-concat conditioning via prepending.
-    CONCAT_BEFORE = 7
-
   def _projected_condition_shape(
       self, input_shape: types.Shape, condition_shape: types.Shape
   ) -> types.Shape:
@@ -133,7 +103,7 @@ def _conditioning_name(self) -> str:
 
   @property
   @abc.abstractmethod
-  def _projection(self) -> Projection:
+  def _projection(self) -> conditioning_spec.Projection:
     pass
 
   @property
@@ -143,7 +113,7 @@ def _projection_channel_shape(self) -> types.Shape | None:
 
   @property
   @abc.abstractmethod
-  def _combination(self) -> Combination:
+  def _combination(self) -> conditioning_spec.Combination:
     pass
 
   @property
@@ -332,7 +302,10 @@ def _tensor_to_fake_sequence(t: jax.Array) -> types.MaskedSequence:
   )
 
 
-class Conditioning(BaseConditioning):
+class Conditioning(
+    BaseConditioning,
+    conditioning_spec.Conditioning[types.Sequence, types.ChannelSpec],
+):
   """Conditions the sequence x on a conditioning sequence c.
 
   Conditioning is done in a time-synchronized way, where each time step of x is
@@ -361,46 +334,17 @@ class Conditioning(BaseConditioning):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(conditioning_spec.Conditioning.Config):
     """Config for Conditioning."""
 
-    # The name of the conditioning sequence or array in the constants
-    # dictionary.
-    conditioning_name: str
-    # The type of projection to perform to project the conditioning before
-    # combination.
-    projection: BaseConditioning.Projection
-    # The type of combination to perform between the projected conditioning and
-    # the input sequence.
-    combination: BaseConditioning.Combination
-    # If projection is LINEAR or LINEAR_AFFINE, the channel shape to project the
-    # conditioning to. If unspecified, projects to the input sequence's channel
-    # shape.
-    projection_channel_shape: types.Shape | None = None
-    # If true, the conditioning sequence is expected to be streamed at the same
-    # block_size as the input sequence.
-    streaming: bool = False
-    # The dtype to use for layer compute.
-    compute_dtype: types.DType | None = None
-    # The dtype to use for layer parameters.
+    # Override defaults or add JAX-specific fields
     param_dtype: types.DType = jnp.float32
-    # Initializer for the kernel.
     kernel_init: nn.initializers.Initializer = nn.linear.default_kernel_init
-    # Optional sharding for the kernel. Any axes that are present in the input
-    # spec are marked as FANIN.
     kernel_sharding: types.Sharding | None = None
-    # Initializer for the bias, if used and not gated by another config option.
     bias_init: nn.initializers.Initializer = nn.initializers.zeros_init()
-    # Optional sharding for the bias.
     bias_sharding: types.Sharding | None = None
-    # An offset to add to the affine scale when `combination` is AFFINE or
-    # AFFINE_SCALE. Typically 1.0 is used with parameter initializations near 0,
-    # as this is close to an identity function and allows the network to learn
-    # residual scaling adjustments more easily.
-    affine_scale_offset: complex = 1.0
-    # An optional name for the layer.
-    name: str | None = None
 
+    @override
     def make(self) -> 'Conditioning':
       return Conditioning(self, name=self.name)
 
diff --git a/sequence_layers/jax/conditioning_test.py b/sequence_layers/jax/conditioning_test.py
index 3c47a92..78b3fa3 100644
--- a/sequence_layers/jax/conditioning_test.py
+++ b/sequence_layers/jax/conditioning_test.py
@@ -19,10 +19,11 @@
 import jax
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import conditioning
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
-
+from sequence_layers.specs import conditioning_behaviors
 
 IDENTITY = conditioning.Conditioning.Projection.IDENTITY
 LINEAR = conditioning.Conditioning.Projection.LINEAR
@@ -40,7 +41,10 @@ def _float_tensor(values):
   return jnp.asarray(values, dtype=jnp.float32)
 
 
-class ConditioningTest(test_utils.SequenceLayerTest):
+class ConditioningTest(
+    conditioning_behaviors.ConditioningTest,
+    test_utils.SequenceLayerTest,
+):
 
   @parameterized.parameters(
       (IDENTITY, ADD, tuple(), tuple(), tuple()),
diff --git a/sequence_layers/jax/convolution.py b/sequence_layers/jax/convolution.py
index 56275f4..2796943 100644
--- a/sequence_layers/jax/convolution.py
+++ b/sequence_layers/jax/convolution.py
@@ -18,18 +18,20 @@
 import fractions
 import math
 import typing
-from typing import Callable, Protocol, Sequence as TypingSequence
+from typing import Callable, Protocol
+from typing import Sequence as TypingSequence
 
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import meta
 from sequence_layers.jax import normalization
 from sequence_layers.jax import types
 from sequence_layers.jax import typing as jt
 from sequence_layers.jax import utils
-
+from sequence_layers.specs import convolution as spec
 
 __all__ = (
     # go/keep-sorted start
@@ -493,7 +495,11 @@ def compute_conv_transpose_mask(
   return jnp.squeeze(test_fn(mask, 0.0), -1)
 
 
-class BaseConv(types.SequenceLayer, metaclass=abc.ABCMeta):
+class BaseConv(
+    spec.BaseConv[types.Sequence, types.ChannelSpec],
+    types.SequenceLayer,
+    metaclass=abc.ABCMeta,
+):
   """Shared base logic for convolution layers."""
 
   @property
@@ -806,11 +812,11 @@ def _apply_kernel_weight_constraints(
   return kernel
 
 
-class Conv1D(BaseConv):
+class Conv1D(spec.Conv1D[types.Sequence, types.ChannelSpec], BaseConv):
   """A 1D strided or dilated convolution layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Conv1D.Config, types.SequenceLayerConfig):
     """Config for Conv1D."""
 
     filters: int
@@ -960,11 +966,13 @@ def _layer(
     return y
 
 
-class DepthwiseConv1D(BaseConv):
+class DepthwiseConv1D(
+    spec.DepthwiseConv1D[types.Sequence, types.ChannelSpec], BaseConv
+):
   """A 1D depthwise strided or dilated convolution layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.DepthwiseConv1D.Config, types.SequenceLayerConfig):
     """Config for DepthwiseConv1D."""
 
     kernel_size: int
@@ -1122,11 +1130,11 @@ def _layer(
     return y
 
 
-class Conv2D(BaseConv):
+class Conv2D(spec.Conv2D[types.Sequence, types.ChannelSpec], BaseConv):
   """A 2D strided or dilated convolution layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Conv2D.Config, types.SequenceLayerConfig):
     """Config for Conv2D."""
 
     filters: int
@@ -1597,11 +1605,13 @@ def _layer(
     return y
 
 
-class Conv1DTranspose(types.SequenceLayer):
+class Conv1DTranspose(
+    spec.Conv1DTranspose[types.Sequence, types.ChannelSpec], types.SequenceLayer
+):
   """A 1D transpose convolution layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Conv1DTranspose.Config, types.SequenceLayerConfig):
     """Config for Conv1DTranspose."""
 
     filters: int
@@ -1885,11 +1895,13 @@ def step(
     return types.Sequence(values, mask), state
 
 
-class Conv2DTranspose(types.SequenceLayer):
+class Conv2DTranspose(
+    spec.Conv2DTranspose[types.Sequence, types.ChannelSpec], types.SequenceLayer
+):
   """A 2D transpose convolution layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Conv2DTranspose.Config, types.SequenceLayerConfig):
     """Configuration for Conv2DTranspose."""
 
     filters: int
diff --git a/sequence_layers/jax/convolution1d_test.py b/sequence_layers/jax/convolution1d_test.py
index 959b078..b425dec 100644
--- a/sequence_layers/jax/convolution1d_test.py
+++ b/sequence_layers/jax/convolution1d_test.py
@@ -22,6 +22,8 @@
 import jax
 import jax.numpy as jnp
 import numpy as np
+import tensorflow as tf
+
 from sequence_layers.jax import combinators
 from sequence_layers.jax import convolution
 from sequence_layers.jax import dsp
@@ -29,7 +31,7 @@
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
-import tensorflow as tf
+from sequence_layers.specs import convolution_behaviors as spec
 
 
 class IdentityArrayConstraint(nn.Module):
@@ -638,7 +640,7 @@ def test_serial_usm(self, take_every_n: int):
     self.verify_contract(l, x, training=False)
 
 
-class Conv1DTest(test_utils.SequenceLayerTest):
+class Conv1DTest(spec.Conv1DTest, test_utils.SequenceLayerTest):
 
   @parameterized.product(
       params=[
@@ -867,6 +869,11 @@ def test_conv1d_dtypes_with_kernel_constraint(
     )
     self.verify_contract(l, x, training=False)
 
+
+class DepthwiseConv1DTest(
+    spec.DepthwiseConv1DTest, test_utils.SequenceLayerTest
+):
+
   @parameterized.product(
       params=[
           # 1x1 conv.
@@ -1169,7 +1176,9 @@ def test_tf_equivalence(self):
       self.assertSequencesClose(y, y_tf)
 
 
-class Conv1DTransposeTest(test_utils.SequenceLayerTest):
+class Conv1DTransposeTest(
+    spec.Conv1DTransposeTest, test_utils.SequenceLayerTest
+):
 
   @parameterized.product(
       params=[
diff --git a/sequence_layers/jax/convolution2d_test.py b/sequence_layers/jax/convolution2d_test.py
index 7fd61ed..683d7f8 100644
--- a/sequence_layers/jax/convolution2d_test.py
+++ b/sequence_layers/jax/convolution2d_test.py
@@ -20,10 +20,12 @@
 import jax
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import convolution
 from sequence_layers.jax import normalization
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import utils
+from sequence_layers.specs import convolution_behaviors as spec
 
 
 class IdentityArrayConstraint(nn.Module):
@@ -51,7 +53,7 @@ class IdentityArrayFactory:
   output_factory = MakeableIdentityArrayConstraint()
 
 
-class Conv2DTest(test_utils.SequenceLayerTest):
+class Conv2DTest(spec.Conv2DTest, test_utils.SequenceLayerTest):
 
   @parameterized.product(
       kernel_size_strides_dilation_rate=(
@@ -320,7 +322,9 @@ def test_conv2d_dtypes_with_kernel_constraint(
     self.verify_contract(l, x, training=False, grad_rtol=1e-5, grad_atol=1e-5)
 
 
-class Conv2DTransposeTest(test_utils.SequenceLayerTest):
+class Conv2DTransposeTest(
+    spec.Conv2DTransposeTest, test_utils.SequenceLayerTest
+):
 
   @parameterized.product(
       params=[
diff --git a/sequence_layers/jax/dense.py b/sequence_layers/jax/dense.py
index 5fbadb0..7abae4c 100644
--- a/sequence_layers/jax/dense.py
+++ b/sequence_layers/jax/dense.py
@@ -15,15 +15,16 @@
 
 import dataclasses
 import typing
-from typing import Callable
+from typing import Callable, override
 
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
+
 from sequence_layers.jax import meta
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
-
+from sequence_layers.specs import dense as spec
 
 __all__ = (
     # go/keep-sorted start
@@ -34,11 +35,11 @@
 )
 
 
-class Dense(types.Stateless, utils.EinsumCommon):
+class Dense(types.Stateless, utils.EinsumCommon, spec.Dense):
   """A basic dense layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.Dense.Config):
     """Dense config."""
 
     # The number of output features for the dense layer.
@@ -73,6 +74,8 @@ class Config(types.SequenceLayerConfig):
     def make(self) -> 'Dense':
       return Dense(self, name=self.name)
 
+
+
   config: Config
 
   @nn.nowrap
@@ -269,7 +272,7 @@ def layer(
     )
 
 
-class EinsumDense(types.Stateless, utils.EinsumCommon):
+class EinsumDense(types.Stateless, utils.EinsumCommon, spec.EinsumDense):
   """A dense layer that transforms the channel shape with an einsum equation.
 
   Equation input and output specs must have leading ellipses to broadcast over
@@ -291,7 +294,7 @@ class EinsumDense(types.Stateless, utils.EinsumCommon):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.EinsumDense.Config):
     """EinsumDense config."""
 
     # An equation describing the einsum to perform. This equation must be a
@@ -338,6 +341,8 @@ def __post_init__(self):
     def make(self) -> 'EinsumDense':
       return EinsumDense(self, name=self.name)
 
+
+
   config: Config
 
   @nn.nowrap
diff --git a/sequence_layers/jax/dense_test.py b/sequence_layers/jax/dense_test.py
index 0edd20b..3820c1f 100644
--- a/sequence_layers/jax/dense_test.py
+++ b/sequence_layers/jax/dense_test.py
@@ -19,21 +19,14 @@
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
+
 from sequence_layers.jax import dense
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
+from sequence_layers.specs import dense_behaviors as spec
 
 
-class DenseTest(test_utils.SequenceLayerTest):
-
-  def test_rank2_unsupported(self):
-    key = jax.random.PRNGKey(1234)
-    l = dense.Dense.Config(
-        3, bias_init=nn.initializers.normal(), name='dense'
-    ).make()
-    x = test_utils.random_sequence(2, 13)
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
+class DenseTest(test_utils.SequenceLayerTest, spec.DenseTest):
 
   @parameterized.parameters(((5,),), ((5, 7),))
   def test_dense(self, channels_shape):
@@ -49,7 +42,7 @@ def test_dense(self, channels_shape):
     self.assertEqual(
         l.get_output_shape_for_sequence(x), channels_shape[:-1] + (3,)
     )
-    self.verify_contract(l, x, training=False, grad_rtol=1e-5, grad_atol=1e-5)
+    self.verify_contract(l, x, training=False, rtol=1e-5, atol=1e-5, grad_rtol=1e-5, grad_atol=1e-5)
 
     chex.assert_trees_all_equal_shapes_and_dtypes(
         flax.core.meta.unbox(l.variables),
@@ -61,17 +54,6 @@ def test_dense(self, channels_shape):
         },
     )
 
-  @parameterized.parameters(True, False)
-  def test_use_bias(self, use_bias):
-    """Check that use_bias controls whether a bias is created."""
-    key = jax.random.PRNGKey(1234)
-    l = dense.Dense.Config(3, use_bias=use_bias).make()
-    x = test_utils.random_sequence(2, 3, 5)
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertCountEqual(
-        l.variables['params'], ['kernel', 'bias'] if use_bias else ['kernel']
-    )
-
   def test_use_einsum_factory(self):
     """Check that einsum_factory produces is used for dense einsum."""
 
@@ -254,7 +236,7 @@ def test_dtypes(self, param_dtype, input_dtype, compute_dtype, use_bias):
     )
 
 
-class EinsumDenseTest(test_utils.SequenceLayerTest):
+class EinsumDenseTest(test_utils.SequenceLayerTest, spec.EinsumDenseTest):
 
   @parameterized.parameters(
       (
@@ -461,22 +443,22 @@ def custom_einsum(equation, *args, **kwargs):
   @parameterized.product(
       test_utils.standard_dtype_configs(),
       (
-          dict(
-              shape=(2, 3, 5, 7, 11),
-              equation='...abc,bd->...bd',
-              output_shape=(None, 13),
-              expected_kernel_shape=(7, 13),
-              bias_axes='',
-              expected_bias_shape=None,
-          ),
-          dict(
-              shape=(2, 3, 5),
-              equation='...a,abcd->...bcd',
-              output_shape=(7, 11, 13),
-              expected_kernel_shape=(5, 7, 11, 13),
-              bias_axes='cd',
-              expected_bias_shape=(11, 13),
-          ),
+          {
+              'shape': (2, 3, 5, 7, 11),
+              'equation': '...abc,bd->...bd',
+              'output_shape': (None, 13),
+              'expected_kernel_shape': (7, 13),
+              'bias_axes': '',
+              'expected_bias_shape': None,
+          },
+          {
+              'shape': (2, 3, 5),
+              'equation': '...a,abcd->...bcd',
+              'output_shape': (7, 11, 13),
+              'expected_kernel_shape': (5, 7, 11, 13),
+              'bias_axes': 'cd',
+              'expected_bias_shape': (11, 13),
+          },
       ),
   )
   def test_dtypes(
@@ -536,27 +518,6 @@ def test_dtypes(
       ).mask_invalid()
     self.assertSequencesClose(y, y_expected)
 
-  def test_einsum_dense_nonbroadcasting_equation(self):
-    with self.assertRaises(ValueError):
-      key = jax.random.PRNGKey(1234)
-      x = test_utils.random_sequence(2, 3, 4, 5, 6)
-      l = dense.EinsumDense.Config(
-          'btabc,bc->btad', output_shape=[None, 2]
-      ).make()
-      self.init_and_bind_layer(key, l, x)
-
-  def test_einsum_dense_inconsistent_input_shape(self):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(2, 3, 5)
-    l = dense.EinsumDense.Config(
-        '...abc,bc->...ad', output_shape=[None, 2]
-    ).make()
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-    # Show it works with the right input shape.
-    x = test_utils.random_sequence(2, 3, 5, 7, 11)
-    self.assertEqual(l.get_output_shape_for_sequence(x), (5, 2))
-
 
 if __name__ == '__main__':
   test_utils.main()
diff --git a/sequence_layers/jax/dsp.py b/sequence_layers/jax/dsp.py
index 4b3a6d1..19f6375 100644
--- a/sequence_layers/jax/dsp.py
+++ b/sequence_layers/jax/dsp.py
@@ -17,15 +17,17 @@
 import dataclasses
 import fractions
 import math
-from typing import Callable, Literal
+from typing import Callable, Literal, override
 
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import convolution
 from sequence_layers.jax import signal
 from sequence_layers.jax import types
+from sequence_layers.specs import dsp as spec
 
 __all__ = (
     # go/keep-sorted start
@@ -49,11 +51,11 @@
 FFTPaddingString = Literal['center', 'right']
 
 
-class Frame(types.PreservesType, types.SequenceLayer):
+class Frame(types.PreservesType, types.SequenceLayer, spec.Frame):
   """Produce a sequence of overlapping frames of the input sequence."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.Frame.Config):
     """Config for Frame layer."""
 
     # The length of frames to generate.
@@ -84,6 +86,7 @@ def __post_init__(self):
             f'{self.padding=} must sum to {self.frame_length - 1=}'
         )
 
+    @override
     def make(self) -> 'Frame':
       return Frame(self, name=self.name)
 
@@ -340,7 +343,7 @@ def layer(
     return result_type(values, mask)
 
 
-class OverlapAdd(types.PreservesType, types.SequenceLayer):
+class OverlapAdd(types.PreservesType, types.SequenceLayer, spec.OverlapAdd):
   """Overlap adds windows of [b, t, frame_length, ...].
 
   For a [b, ti, frame_length, ...] input signal, the resulting sequence has
@@ -353,7 +356,7 @@ class OverlapAdd(types.PreservesType, types.SequenceLayer):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.OverlapAdd.Config):
     """Config for OverlapAdd layer."""
 
     # The length of frames to overlap-add.
@@ -381,6 +384,7 @@ def __post_init__(self):
       ):
         raise ValueError(f'Unsupported padding mode: {self.padding}')
 
+    @override
     def make(self) -> 'OverlapAdd':
       return OverlapAdd(self, name=self.name)
 
@@ -705,16 +709,17 @@ def layer(
     return fft_fn(x, axis=axis)
 
 
-class FFT(types.PreservesType, FFTBase):
+class FFT(types.PreservesType, FFTBase, spec.FFT):
   """A layer that applies an FFT to the channels dimension."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.FFT.Config):
     fft_length: int | None = None
     axis: int = -1
     padding: FFTPaddingString = _DEFAULT_FFT_PADDING
     name: str | None = None
 
+    @override
     def make(self) -> 'FFT':
       return FFT(self, name=self.name)
 
@@ -746,17 +751,18 @@ def fft_fn(x, axis):
     return fft_fn
 
 
-class IFFT(types.PreservesType, FFTBase):
+class IFFT(types.PreservesType, FFTBase, spec.IFFT):
   """A layer that applies an IFFT to the channels dimension."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.IFFT.Config):
     fft_length: int | None = None
     frame_length: int | None = None
     axis: int = -1
     padding: FFTPaddingString = _DEFAULT_FFT_PADDING
     name: str | None = None
 
+    @override
     def make(self) -> 'IFFT':
       return IFFT(self, name=self.name)
 
@@ -788,16 +794,17 @@ def ifft_fn(a, axis):
     return ifft_fn
 
 
-class RFFT(FFTBase):
+class RFFT(FFTBase, spec.RFFT):
   """A layer that applies an RFFT to the channels dimension."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.RFFT.Config):
     fft_length: int | None = None
     axis: int = -1
     padding: FFTPaddingString = _DEFAULT_FFT_PADDING
     name: str | None = None
 
+    @override
     def make(self) -> 'RFFT':
       return RFFT(self, name=self.name)
 
@@ -854,17 +861,18 @@ def get_output_dtype(
         raise ValueError(f'Unsupported input dtype: {input_dtype}')
 
 
-class IRFFT(FFTBase):
+class IRFFT(FFTBase, spec.IRFFT):
   """A layer that applies an IRFFT to the channels dimension."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.IRFFT.Config):
     fft_length: int | None = None
     frame_length: int | None = None
     axis: int = -1
     padding: FFTPaddingString = _DEFAULT_FFT_PADDING
     name: str | None = None
 
+    @override
     def make(self) -> 'IRFFT':
       return IRFFT(self, name=self.name)
 
@@ -921,7 +929,7 @@ def irfft_fn(a, axis=-1):
     return irfft_fn
 
 
-class STFT(types.SequenceLayer):
+class STFT(types.SequenceLayer, spec.STFT):
   """Computes the Short-time Fourier Transform of input signals.
 
   When used with 'right' FFT padding, equivalent to tf.signal.stft.
@@ -933,7 +941,7 @@ class STFT(types.SequenceLayer):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.STFT.Config):
     """Config for STFT layer."""
 
     # The frame length of the STFT.
@@ -965,6 +973,7 @@ def __post_init__(self):
           self, 'time_padding', types.validate_padding(self.time_padding)
       )
 
+    @override
     def make(self) -> 'STFT':
       return STFT(self, name=self.name)
 
@@ -1128,7 +1137,7 @@ def layer(
     return dft
 
 
-class InverseSTFT(types.SequenceLayer):
+class InverseSTFT(types.SequenceLayer, spec.InverseSTFT):
   """Computes the inverse Short-time Fourier Transform of input signals.
 
   When used with 'right' FFT padding, equivalent to tf.signal.inverse_stft.
@@ -1140,7 +1149,7 @@ class InverseSTFT(types.SequenceLayer):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.InverseSTFT.Config):
     """Config for the InverseSTFT layer."""
 
     # The frame length of the inverse STFT.
@@ -1169,6 +1178,7 @@ def __post_init__(self):
           self, 'time_padding', types.validate_padding(self.time_padding)
       )
 
+    @override
     def make(self) -> 'InverseSTFT':
       return InverseSTFT(self, name=self.name)
 
@@ -1388,7 +1398,9 @@ def layer(
     return ola
 
 
-class LinearToMelSpectrogram(types.PreservesType, types.Stateless):
+class LinearToMelSpectrogram(
+    types.PreservesType, types.Stateless, spec.LinearToMelSpectrogram
+):
   """Converts linear-scale spectrogram to a mel-scale spectrogram.
 
   The spectrogram magnitudes should be uncompressed, *not* log compressed.
@@ -1397,7 +1409,7 @@ class LinearToMelSpectrogram(types.PreservesType, types.Stateless):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.LinearToMelSpectrogram.Config):
     """Config for LinearToMelSpectrogram layer."""
 
     # The number of mel bins to compute.
@@ -1412,6 +1424,7 @@ class Config(types.SequenceLayerConfig):
     # An optional name for the layer.
     name: str | None = None
 
+    @override
     def make(self) -> 'LinearToMelSpectrogram':
       return LinearToMelSpectrogram(self, name=self.name)
 
@@ -1459,7 +1472,9 @@ def layer(
     )
 
 
-class Delay(types.PreservesShape, types.PreservesType, types.SequenceLayer):
+class Delay(
+    types.PreservesShape, types.PreservesType, types.SequenceLayer, spec.Delay
+):
   """A layer that delays its input by `length` timesteps.
 
   In contrast to sl.Lookahead, which drops `length` timesteps from the start of
@@ -1468,7 +1483,7 @@ class Delay(types.PreservesShape, types.PreservesType, types.SequenceLayer):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.Delay.Config):
     """Config for Delay layer."""
 
     # The non-negative length of the delay to apply. A length of zero is a
@@ -1484,6 +1499,7 @@ class Config(types.SequenceLayerConfig):
     # An optional name for the layer.
     name: str | None = None
 
+    @override
     def make(self) -> 'Delay':
       return Delay(self, name=self.name)
 
@@ -1569,7 +1585,12 @@ def layer(
       return x
 
 
-class Lookahead(types.PreservesShape, types.PreservesType, types.SequenceLayer):
+class Lookahead(
+    types.PreservesShape,
+    types.PreservesType,
+    types.SequenceLayer,
+    spec.Lookahead,
+):
   """A layer that drops the first `length` timesteps from its input.
 
   In contrast to sl.Delay, which inserts `length` invalid timesteps at the start
@@ -1578,7 +1599,7 @@ class Lookahead(types.PreservesShape, types.PreservesType, types.SequenceLayer):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.Lookahead.Config):
     """Config for Lookahead layer."""
 
     # The non-negative length of the lookahead to apply. A length of zero is a
@@ -1590,6 +1611,7 @@ class Config(types.SequenceLayerConfig):
     # An optional name for the layer.
     name: str | None = None
 
+    @override
     def make(self) -> 'Lookahead':
       return Lookahead(self, name=self.name)
 
@@ -1664,11 +1686,13 @@ def layer(
       return x
 
 
-class Window(types.PreservesShape, types.PreservesType, types.Stateless):
+class Window(
+    types.PreservesShape, types.PreservesType, types.Stateless, spec.Window
+):
   """Applies a window function as in the STFT/InverseSTFT."""
 
   @dataclasses.dataclass(frozen=True, slots=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.Window.Config):
     """Config of this layer."""
 
     # The axis onto which the window is applied.
@@ -1678,6 +1702,7 @@ class Config(types.SequenceLayerConfig):
     # Optional name for this layer.
     name: str | None = None
 
+    @override
     def make(self) -> 'Window':
       return Window(self, name=self.name)
 
@@ -1701,6 +1726,7 @@ def _get_axis(self, x: types.Sequence):
     return axis
 
   @types.check_layer
+  @override
   def layer(
       self,
       x: types.Sequence,
diff --git a/sequence_layers/jax/dsp_test.py b/sequence_layers/jax/dsp_test.py
index fdfd125..26fc43c 100644
--- a/sequence_layers/jax/dsp_test.py
+++ b/sequence_layers/jax/dsp_test.py
@@ -20,231 +20,29 @@
 import jax
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import combinators
 from sequence_layers.jax import dsp
 from sequence_layers.jax import signal
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
-from sequence_layers.jax import utils
-import tensorflow as tf
-
-
-def _pad_or_truncate_for_fft(values, padding, axis, required_input_length):
-  axis_size = values.shape[axis]
-  pad_amount = max(0, required_input_length - axis_size)
-  if padding == 'center':
-    left = pad_amount // 2
-    right = pad_amount - left
-  else:
-    assert padding == 'right'
-    left, right = 0, pad_amount
-
-  paddings = [(0, 0)] * values.ndim
-  paddings[axis] = (left, right)
-  values = np.pad(values, paddings)
-  axis_size = values.shape[axis]
+from sequence_layers.specs import dsp_behaviors as spec
 
-  trim_amount = max(0, axis_size - required_input_length)
-  if padding == 'center':
-    left = trim_amount // 2
-  else:
-    left = 0
-  return jax.lax.slice_in_dim(values, left, required_input_length, axis=axis)
 
+class FFTTest(test_utils.SequenceLayerTest, spec.FFTTest):
+  """Verify FFT contract."""
 
-class FFTTest(test_utils.SequenceLayerTest, parameterized.TestCase):
-
-  @parameterized.parameters(
-      itertools.product(
-          (((2, 3, 32), -1), ((2, 3, 5, 32), -1), ((2, 3, 5, 32), -2)),
-          (31, 32, 33),
-          ('center', 'right'),
-      )
-  )
-  def test_fft(self, shape_axis, fft_length, padding):
-    shape, axis = shape_axis
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape, dtype=jnp.complex64, low_length=1)
-    l = dsp.FFT.Config(
-        fft_length, axis=axis, padding=padding, name='fft'
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'fft')
-    channel_shape = list(shape[2:])
-    channel_shape[axis] = fft_length
-    self.assertEqual(l.get_output_shape_for_sequence(x), tuple(channel_shape))
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
 
-    # Check that the result is the same as manually padding/truncating followed
-    # by the FFT.
-    def apply_fft(values):
-      values = _pad_or_truncate_for_fft(values, padding, axis, fft_length)
-      return np.fft.fft(values, n=fft_length, axis=axis)
+class IFFTTest(test_utils.SequenceLayerTest, spec.IFFTTest):
+  """Verify IFFT contract."""
 
-    y_expected = x.apply_values(apply_fft).mask_invalid()
-    self.assertSequencesClose(y, y_expected, atol=1e-5, rtol=1e-5)
-    self.assertEqual(y.shape[axis], fft_length)
 
+class RFFTTest(test_utils.SequenceLayerTest, spec.RFFTTest):
+  """Verify RFFT contract."""
 
-class IFFTTest(test_utils.SequenceLayerTest, parameterized.TestCase):
 
-  @parameterized.parameters(
-      itertools.product(
-          (((2, 3, 32), -1), ((2, 3, 5, 32), -1), ((2, 3, 5, 32), -2)),
-          (31, 32, 33, None),
-          ('center', 'right'),
-      )
-  )
-  def test_ifft(self, shape_axis, frame_length, padding):
-    shape, axis = shape_axis
-    key = jax.random.PRNGKey(1234)
-
-    # The length of the input sequence.
-    fft_length = shape[axis]
-
-    x = test_utils.random_sequence(*shape, dtype=jnp.complex64)
-    l = dsp.IFFT.Config(
-        fft_length,
-        frame_length=frame_length,
-        axis=axis,
-        padding=padding,
-        name='ifft',
-    ).make()
-
-    # If frame_length is not provided, it should be infered from the input
-    # length by IFFT to be the same as fft_length.
-    if frame_length is None:
-      frame_length = fft_length
-
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'ifft')
-    channel_shape = list(shape[2:])
-    channel_shape[axis] = frame_length
-    self.assertEqual(l.get_output_shape_for_sequence(x), tuple(channel_shape))
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
-    # Check that the result is the same as manually padding/truncating followed
-    # by the IFFT.
-    def apply_fft(values):
-      values = np.fft.ifft(values, n=fft_length, axis=axis)
-      return _pad_or_truncate_for_fft(values, padding, axis, frame_length)
-
-    y_expected = x.apply_values(apply_fft).mask_invalid()
-    self.assertSequencesClose(y, y_expected, atol=1e-5, rtol=1e-5)
-    self.assertEqual(y.shape[axis], frame_length)
-
-
-class RFFTTest(test_utils.SequenceLayerTest, parameterized.TestCase):
-
-  def run_rfft_test(self, shape_axis, fft_length, padding, dtype):
-    shape, axis = shape_axis
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape, dtype=dtype)
-    l = dsp.RFFT.Config(
-        fft_length, axis=axis, padding=padding, name='rfft'
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'rfft')
-
-    channel_shape = list(shape[2:])
-    channel_shape[axis] = fft_length // 2 + 1
-    self.assertEqual(l.get_output_shape_for_sequence(x), tuple(channel_shape))
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
-    # Check that the result is the same as manually padding/truncating followed
-    # by the RFFT.
-    def apply_fft(values):
-      values = _pad_or_truncate_for_fft(values, padding, axis, fft_length)
-      return np.fft.rfft(values, n=fft_length, axis=axis)
-
-    y_expected = x.apply_values(apply_fft).mask_invalid()
-    self.assertSequencesClose(y, y_expected, atol=1e-5, rtol=1e-5)
-    self.assertEqual(y.shape[axis], fft_length // 2 + 1)
-
-  @parameterized.parameters(
-      itertools.product(
-          (((2, 3, 32), -1), ((2, 3, 5, 32), -1), ((2, 3, 5, 32), -2)),
-          (31, 32, 33),
-          ('center', 'right'),
-      )
-  )
-  def test_rfft(self, shape_axis, fft_length, padding):
-    self.run_rfft_test(
-        shape_axis=shape_axis,
-        fft_length=fft_length,
-        padding=padding,
-        dtype=jnp.float32,
-    )
-
-  def test_rfft_bfloat16(self):
-    self.run_rfft_test(
-        shape_axis=((2, 3, 32), -1),
-        fft_length=31,
-        padding='center',
-        dtype=jnp.bfloat16,
-    )
-
-
-class IRFFTTest(test_utils.SequenceLayerTest, parameterized.TestCase):
-
-  @parameterized.parameters(
-      itertools.product(
-          (((2, 3, 17), -1), ((2, 3, 5, 17), -1), ((2, 3, 5, 17), -2)),
-          (31, 32, 33, None),
-          (32, None),
-          ('center', 'right'),
-      )
-  )
-  def test_irfft(self, shape_axis, frame_length, fft_length, padding):
-    shape, axis = shape_axis
-    key = jax.random.PRNGKey(1234)
-
-    x = test_utils.random_sequence(*shape, dtype=jnp.complex64)
-    l = dsp.IRFFT.Config(
-        fft_length,
-        frame_length=frame_length,
-        axis=axis,
-        padding=padding,
-        name='irfft',
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-
-    # If frame_length or fft_length are not provided, they are infered from the
-    # input shape.
-    if fft_length is None:
-      fft_length = 2 * (shape[axis] - 1)
-
-    if frame_length is None:
-      frame_length = fft_length
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'irfft')
-    channel_shape = list(shape[2:])
-    channel_shape[axis] = frame_length
-    self.assertEqual(l.get_output_shape_for_sequence(x), tuple(channel_shape))
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
-    # Check that the result is the same as manually padding/truncating followed
-    # by the IRFFT.
-    def apply_fft(values):
-      values = np.fft.irfft(values, n=fft_length, axis=axis)
-      return _pad_or_truncate_for_fft(values, padding, axis, frame_length)
-
-    y_expected = x.apply_values(apply_fft).mask_invalid()
-    self.assertSequencesClose(y, y_expected, atol=1e-5, rtol=1e-5)
-    self.assertEqual(y.shape[axis], frame_length)
+class IRFFTTest(test_utils.SequenceLayerTest, spec.IRFFTTest):
+  """Verify IRFFT contract."""
 
 
 class FFTInverseTTest(test_utils.SequenceLayerTest, parameterized.TestCase):
@@ -294,15 +92,11 @@ def test_fft_inverse(self, shape_axis, fft_length, fft_config_dtype, padding):
     )
 
     # Shortcuts.
-    forward_fn = lambda x: forward(x, training=False)
-    backward_fn = lambda x: backward(x, training=False)
+    def forward_fn(val):
+      return forward(val, training=False)
 
-    # Depending on the padding parameters, there may be no inverse.
-    # In that case the backward transform should be the pseudo-inverse.
-    # For a general test, we test the pseudo-inverse properties. Let A and B
-    # be the forward and backward transforms. They should satisfy:
-    # 1) A B A = A
-    # 2) B A B = B
+    def backward_fn(val):
+      return backward(val, training=False)
 
     y_A = forward_fn(x)  # pylint: disable=invalid-name
     y_BA = backward_fn(y_A)  # pylint: disable=invalid-name
@@ -315,7 +109,8 @@ def test_fft_inverse(self, shape_axis, fft_length, fft_config_dtype, padding):
     self.assertSequencesClose(y_BA, y_BABA, atol=1e-5, rtol=1e-3)
 
 
-class FrameTest(test_utils.SequenceLayerTest, parameterized.TestCase):
+class FrameTest(test_utils.SequenceLayerTest, spec.FrameTest):
+  """Verify Frame contract and JAX specific behaviors."""
 
   @parameterized.product(
       frame_length_frame_step=((1, 1), (2, 1), (1, 2), (2, 2), (3, 2), (2, 3)),
@@ -328,25 +123,20 @@ class FrameTest(test_utils.SequenceLayerTest, parameterized.TestCase):
           'reverse_causal',
           'same',
           'valid',
-          'explicit_semicausal',
           'semicausal_full',
       ),
   )
-  def test_frame(self, frame_length_frame_step, channel_shape, padding):
+  def test_frame_exhaustive(
+      self, frame_length_frame_step, channel_shape, padding
+  ):
     key = jax.random.PRNGKey(1234)
     batch_size = 2
     frame_length, frame_step = frame_length_frame_step
-    if padding == 'explicit_semicausal':
-      total_pad = frame_length - 1
-      overlap = max(0, frame_length - frame_step)
-      explicit_padding = (overlap, total_pad - overlap)
-    else:
-      explicit_padding = padding
     x = test_utils.random_sequence(batch_size, 1, *channel_shape)
     l = dsp.Frame.Config(
         frame_length=frame_length,
         frame_step=frame_step,
-        padding=explicit_padding,
+        padding=padding,
         name='frame',
     ).make()
     l = self.init_and_bind_layer(key, l, x)
@@ -359,7 +149,6 @@ def test_frame(self, frame_length_frame_step, channel_shape, padding):
             'reverse_causal_valid',
             'causal',
             'reverse_causal',
-            'explicit_semicausal',
         ),
     )
     self.assertEqual(l.block_size, frame_step)
@@ -369,14 +158,6 @@ def test_frame(self, frame_length_frame_step, channel_shape, padding):
         expected_input_latency = 0
       case 'reverse_causal_valid' | 'reverse_causal':
         expected_input_latency = frame_length - 1
-      case 'explicit_semicausal':
-        # If frame_length >= frame_step, the below expression simplifies to
-        # frame_step - 1. If frame_length < frame_step, the expression
-        # simplifies to frame_length - 1. In both cases, the output latency will
-        # be zero both expressions are less than frame_step.
-        expected_input_latency = (frame_length - 1) - max(
-            0, frame_length - frame_step
-        )
       case 'semicausal_full':
         expected_input_latency = frame_step - 1
       case _:
@@ -398,238 +179,12 @@ def test_frame(self, frame_length_frame_step, channel_shape, padding):
       self.verify_contract(l, x, training=False)
 
 
-class STFTTest(test_utils.SequenceLayerTest, parameterized.TestCase):
-
-  @parameterized.parameters(
-      itertools.product(
-          (True, False),
-          (1, 2, 3, 4),
-          (1, 2),
-          (2, 3),
-          (
-              'causal_valid',
-              'valid',
-              'same',
-              'reverse_causal_valid',
-              'causal',
-              'reverse_causal',
-          ),
-          ('center', 'right'),
-      )
-  )
-  def test_stft(
-      self,
-      output_magnitude,
-      frame_length,
-      frame_step,
-      fft_length,
-      time_padding,
-      fft_padding,
-  ):
-    key = jax.random.PRNGKey(1234)
-    batch_size, time = 2, 20
-    x = test_utils.random_sequence(batch_size, time)
-    l = dsp.STFT.Config(
-        output_magnitude=output_magnitude,
-        frame_length=frame_length,
-        frame_step=frame_step,
-        fft_length=fft_length,
-        time_padding=time_padding,
-        fft_padding=fft_padding,
-        name='stft',
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, frame_step)
-    self.assertEqual(
-        l.supports_step,
-        time_padding
-        in ('causal_valid', 'reverse_causal_valid', 'causal', 'reverse_causal'),
-    )
-    self.assertEqual(1 / l.output_ratio, frame_step)
-    match time_padding:
-      case 'causal_valid' | 'causal':
-        expected_input_latency = 0
-      case 'reverse_causal_valid' | 'reverse_causal':
-        expected_input_latency = frame_length - 1
-      case 'semicausal':
-        # If frame_length > frame_step, input_latency is frame_step - 1 so the
-        # output latency is always zero.
-        expected_input_latency = (frame_length - 1) - max(
-            0, frame_length - frame_step
-        )
-      case _:
-        # Unsupported defaults to zero.
-        expected_input_latency = 0
-    self.assertEqual(l.input_latency, expected_input_latency)
-    self.assertEqual(l.output_latency, expected_input_latency // frame_step)
-    self.assertEqual(l.name, 'stft')
-    self.assertEqual(l.get_output_shape_for_sequence(x), (fft_length // 2 + 1,))
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
-    # Check compatibility with tf.signal.stft (which only supports right
-    # padding).
-    if fft_padding == 'right':
-      left, right = utils.convolution_explicit_padding(
-          time_padding, frame_length, frame_step, dilation_rate=1
-      )
-      # Mask is unused so valid does not matter.
-      x = x.pad_time(left, right, valid=False)
-      y_tfs = tf.signal.stft(
-          x.values,
-          frame_length=frame_length,
-          frame_step=frame_step,
-          fft_length=fft_length,
-          pad_end=False,
-      )
-      if output_magnitude:
-        y_tfs = tf.abs(y_tfs)
-      y_tfs = types.Sequence(y_tfs.numpy(), y.mask).mask_invalid().values
-      self.assertAllClose(y.values, y_tfs)
-
-  @parameterized.product(channel_shape=((1,), (2,), (2, 3)))
-  def test_multichannel(self, channel_shape):
-    key = jax.random.PRNGKey(1234)
-    batch_size, time = 2, 20
-    x = test_utils.random_sequence(
-        batch_size, time, low_length=time // 2, *channel_shape
-    )
-    l = dsp.STFT.Config(
-        output_magnitude=True,
-        frame_length=8,
-        frame_step=3,
-        fft_length=8,
-        time_padding='causal',
-        fft_padding='right',
-        name='stft',
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 3)
-    self.assertTrue(l.supports_step)
-    self.assertEqual(1 / l.output_ratio, 3)
-    self.assertEqual(l.name, 'stft')
-    y = self.verify_contract(l, x, training=False)
-
-    x_flat = x.apply_values(lambda v: v.reshape(v.shape[:2] + (-1,)))
-    ys = []
-    for x_i in utils.sequence_unstack(x_flat, axis=2):
-      ys.append(l.layer(x_i, training=False))
-
-    y_expected = (
-        utils.sequence_stack(ys, axis=3)
-        .apply_values(lambda v: v.reshape(v.shape[:3] + channel_shape))
-        .mask_invalid()
-    )
-    self.assertSequencesClose(y, y_expected)
+class STFTTest(test_utils.SequenceLayerTest, spec.STFTTest):
+  """Verify STFT contract."""
 
 
-class InverseSTFTTest(test_utils.SequenceLayerTest, parameterized.TestCase):
-
-  @parameterized.parameters(
-      itertools.product(
-          (1, 2, 3, 4),
-          (1, 2),
-          (2, 3),
-          (
-              'causal',
-              # 'same',  # TODO(rryan): Fix SAME tests.
-              'valid',
-          ),
-          ('center', 'right'),
-      )
-  )
-  def test_inverse_stft(
-      self,
-      frame_length,
-      frame_step,
-      fft_length,
-      time_padding,
-      fft_padding,
-  ):
-    if frame_length < frame_step:
-      self.skipTest('TODO(rryan): Enable length < step tests.')
-    key = jax.random.PRNGKey(1234)
-    batch_size, time = 2, 20
-    x = test_utils.random_sequence(
-        batch_size, time, fft_length // 2 + 1, dtype=jnp.complex64
-    )
-    l = dsp.InverseSTFT.Config(
-        frame_length=frame_length,
-        frame_step=frame_step,
-        fft_length=fft_length,
-        window_fn=signal.inverse_stft_window_fn(frame_step, signal.hann_window),
-        time_padding=time_padding,
-        fft_padding=fft_padding,
-        name='inverse_stft',
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, frame_step)
-    self.assertEqual(l.name, 'inverse_stft')
-    # Only streamable in causal mode.
-    self.assertEqual(l.supports_step, time_padding == 'causal')
-    self.assertEqual(l.get_output_shape_for_sequence(x), ())
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
-    # Check compatibility with tf.signal.inverse_stft (which only supports right
-    # FFT padding).
-    if fft_padding == 'right':
-      y_tfs = tf.signal.inverse_stft(
-          x.values,
-          frame_length=frame_length,
-          frame_step=frame_step,
-          fft_length=fft_length,
-          window_fn=tf.signal.inverse_stft_window_fn(
-              frame_step, tf.signal.hann_window
-          ),
-      )
-      if time_padding == 'causal':
-        if trim := max(frame_length - frame_step, 0):
-          y_tfs = y_tfs[:, :-trim]
-      y_tfs = types.Sequence(y_tfs.numpy(), y.mask).mask_invalid().values
-      self.assertAllClose(y.values, y_tfs)
-
-  @parameterized.product(channel_shape=((1,), (2,), (2, 3)))
-  def test_multichannel(self, channel_shape):
-    key = jax.random.PRNGKey(1234)
-    batch_size, time = 2, 20
-    frame_length, frame_step, fft_length = 8, 3, 8
-    x = test_utils.random_sequence(
-        batch_size,
-        time,
-        fft_length // 2 + 1,
-        *channel_shape,
-        low_length=time // 2,
-        dtype=jnp.complex64,
-    )
-    l = dsp.InverseSTFT.Config(
-        frame_length=frame_length,
-        frame_step=frame_step,
-        fft_length=fft_length,
-        window_fn=signal.inverse_stft_window_fn(frame_step, signal.hann_window),
-        time_padding='causal',
-        fft_padding='right',
-        name='inverse_stft',
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 1)
-    self.assertTrue(l.supports_step)
-    self.assertEqual(l.output_ratio, frame_step)
-    self.assertEqual(l.name, 'inverse_stft')
-    y = self.verify_contract(l, x, training=False)
-
-    x_flat = x.apply_values(lambda v: v.reshape(v.shape[:3] + (-1,)))
-    ys = []
-    for x_i in utils.sequence_unstack(x_flat, axis=3):
-      ys.append(l.layer(x_i, training=False))
-
-    y_expected = (
-        utils.sequence_stack(ys, axis=2)
-        .apply_values(lambda v: v.reshape(v.shape[:2] + channel_shape))
-        .mask_invalid()
-    )
-    self.assertSequencesClose(y, y_expected)
+class InverseSTFTTest(test_utils.SequenceLayerTest, spec.InverseSTFTTest):
+  """Verify InverseSTFT contract."""
 
 
 class STFTPerfectReconstructionTest(
@@ -670,7 +225,7 @@ def test_stft_perfect_reconstruction_padding_semicausal_full(
             frame_length=frame_length,
             frame_step=frame_step,
             fft_length=fft_length,
-            window_fn=signal.hann_window,
+            window_fn=window_fn,
             time_padding=time_padding,
             fft_padding=fft_padding,
             name='stft',
@@ -683,9 +238,7 @@ def test_stft_perfect_reconstruction_padding_semicausal_full(
             frame_length=frame_length,
             frame_step=frame_step,
             fft_length=fft_length,
-            window_fn=signal.inverse_stft_window_fn(
-                frame_step, signal.hann_window
-            ),
+            window_fn=signal.inverse_stft_window_fn(frame_step, window_fn),
             time_padding=time_padding,
             fft_padding=fft_padding,
             name='inverse_stft',
@@ -719,86 +272,16 @@ def test_stft_perfect_reconstruction_padding_semicausal_full(
 
 
 class LinearToMelSpectrogramTest(
-    test_utils.SequenceLayerTest, parameterized.TestCase
+    test_utils.SequenceLayerTest, spec.LinearToMelSpectrogramTest
 ):
+  """Verify LinearToMelSpectrogram contract."""
 
-  def test_linear_to_mel_spectrogram(self):
-    key = jax.random.PRNGKey(1234)
-    batch_size, time, num_spectrogram_bins = 2, 3, 5
-    x = test_utils.random_sequence(batch_size, time, num_spectrogram_bins)
-    l = dsp.LinearToMelSpectrogram.Config(
-        num_mel_bins=8,
-        sample_rate=400,
-        lower_edge_hertz=1.0,
-        upper_edge_hertz=200.0,
-        name='linear_to_mel_spectrogram',
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'linear_to_mel_spectrogram')
-    self.assertEqual(l.get_output_shape_for_sequence(x), (8,))
-    self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
 
-
-class OverlapAddTest(test_utils.SequenceLayerTest, parameterized.TestCase):
-
-  @parameterized.parameters(
-      itertools.product(
-          (
-              (1, 1),
-              (2, 1),
-              (2, 2),
-              (3, 2),
-              # TODO(rryan): Fix frame_length < frame_step tests.
-              # (1, 2),
-              # (2, 3),
-          ),
-          ((), (3,), (5, 9)),
-          (
-              'causal',
-              # 'same',  # TODO(rryan): Fix SAME tests.
-              'valid',
-              'semicausal_full',
-          ),
-      )
-  )
-  def test_overlap_add(self, frame_length_frame_step, inner_shape, padding):
-    if (
-        frame_length_frame_step == (4, 2)
-        and inner_shape == (5, 9)
-        and padding == 'valid'
-    ):
-      self.skipTest('b/423622422')
-    key = jax.random.PRNGKey(1234)
-    frame_length, frame_step = frame_length_frame_step
-
-    # TODO(rryan): Check why test fails with t = 35.
-    b, t = 2, 34
-    x = test_utils.random_sequence(b, t, frame_length, *inner_shape)
-    l = dsp.OverlapAdd.Config(
-        frame_length=frame_length,
-        frame_step=frame_step,
-        padding=padding,
-        name='overlap_add',
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.supports_step, padding == 'causal')
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, frame_step)
-    self.assertEqual(l.name, 'overlap_add')
-    self.assertEqual(
-        l.get_output_shape_for_sequence(x),
-        inner_shape,
-    )
-    self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
+class OverlapAddTest(test_utils.SequenceLayerTest, spec.OverlapAddTest):
+  """Verify OverlapAdd contract and perfect reconstruction."""
 
   @parameterized.parameters((1, 1), (2, 1), (2, 2), (3, 2))
   def test_frame_overlap_add_perfect(self, frame_length, frame_step):
-
     b, t = 2, 35
     x = test_utils.random_sequence(b, t)
     forward = (
@@ -825,23 +308,21 @@ def test_frame_overlap_add_perfect(self, frame_length, frame_step):
     y = forward.layer(x, training=False)
     z = backward.layer(y, training=False)
 
-    # z should not be shorter than x and the extra part should only contain
-    # zeros.
     self.assertLessEqual(x.shape[1], z.shape[1])
-    self.assertTrue(jnp.all(z.lengths() >= x.lengths()))
+    self.assertTrue(np.all(np.array(z.lengths()) >= np.array(x.lengths())))
     np.testing.assert_array_equal(
         z.mask[:, x.shape[1] :],
         jnp.zeros((z.shape[0], z.shape[1] - x.shape[1]), dtype=jnp.bool_),
     )
 
-    # The extra valid entries of z should only contain zeros.
     z_values = z.values[:, : x.shape[1]]
     z_mask = z.mask[:, : x.shape[1]]
     difference_mask = jnp.logical_xor(x.mask, z_mask)
     self.assertTrue(jnp.all(z_values[difference_mask] == 0))
 
 
-class DelayTest(test_utils.SequenceLayerTest, parameterized.TestCase):
+class DelayTest(test_utils.SequenceLayerTest, spec.DelayTest):
+  """Verify Delay contract and nonnegative checks."""
 
   def test_delay_nonnegative(self):
     x = test_utils.random_sequence(2, 11, 3, 5)
@@ -849,32 +330,9 @@ def test_delay_nonnegative(self):
     with self.assertRaises(ValueError):
       l.layer(x, training=False)
 
-  @parameterized.product(length=(0, 1, 4), delay_layer_output=(True, False))
-  def test_delay(self, length, delay_layer_output):
-    x = test_utils.random_sequence(2, 11, 3, 5)
-    l = (
-        dsp.Delay.Config(
-            length=length, delay_layer_output=delay_layer_output, name='delay'
-        )
-        .make()
-        .bind({})
-    )
-    self.assertTrue(l.supports_step)
-    self.assertEqual(l.input_latency, length)
-    self.assertEqual(l.output_latency, 0 if delay_layer_output else length)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'delay')
-    self.assertEqual(
-        l.get_output_shape_for_sequence(x),
-        (3, 5),
-    )
-    y = self.verify_contract(l, x, training=False)
-    self.assertEqual(y.shape[1], 11 + length if delay_layer_output else 11)
-    self.assertEmpty(l.variables)
-
 
-class LookaheadTest(test_utils.SequenceLayerTest, parameterized.TestCase):
+class LookaheadTest(test_utils.SequenceLayerTest, spec.LookaheadTest):
+  """Verify Lookahead contract and nonnegative checks."""
 
   def test_lookahead_nonnegative(self):
     x = test_utils.random_sequence(2, 11, 3, 5)
@@ -882,47 +340,18 @@ def test_lookahead_nonnegative(self):
     with self.assertRaises(ValueError):
       l.layer(x, training=False)
 
-  @parameterized.product(length=(0, 1, 4))
-  def test_lookahead(self, length):
-    x = test_utils.random_sequence(2, 11, 3, 5)
-    l = dsp.Lookahead.Config(length=length, name='lookahead').make().bind({})
-    self.assertTrue(l.supports_step)
-    self.assertEqual(l.input_latency, 0)
-    self.assertEqual(l.output_latency, length)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'lookahead')
-    self.assertEqual(
-        l.get_output_shape_for_sequence(x),
-        (3, 5),
-    )
-    y = self.verify_contract(l, x, training=False)
-    self.assertEqual(y.shape[1], 11 - length)
-    self.assertEmpty(l.variables)
-
-  def test_lookahead_preserve_length_in_layer(self):
-    x = test_utils.random_sequence(2, 11, 3, 5)
-    l = (
-        dsp.Lookahead.Config(
-            length=2,
-            preserve_length_in_layer=True,
-            name='lookahead',
-        )
-        .make()
-        .bind({})
-    )
-    y = self.verify_contract(l, x, training=False)
-    self.assertEqual(y.shape[1], 11)
-
 
-class WindowTest(test_utils.SequenceLayerTest, parameterized.TestCase):
+class WindowTest(test_utils.SequenceLayerTest, spec.WindowTest):
+  """Verify Window contract and invalid axis handling."""
 
   @parameterized.parameters(
       (20, 2),
       (15, 3),
       (10, 4),
   )
-  def test_window(self, frame_step, frame_length_multiplier):
+  def test_window_perfect_reconstruction(
+      self, frame_step, frame_length_multiplier
+  ):
     frame_length = frame_step * frame_length_multiplier
     batch = 2
     time = 11 * frame_step + frame_length
@@ -937,7 +366,7 @@ def test_window(self, frame_step, frame_length_multiplier):
                     frame_length=frame_length,
                     frame_step=frame_step,
                     padding='semicausal',
-                ),  # (B T1 T2 S+K O)
+                ),
                 dsp.Window.Config(
                     axis=2,
                     window_fn=signal.hamming_window,
@@ -952,7 +381,7 @@ def test_window(self, frame_step, frame_length_multiplier):
                     frame_length=frame_length,
                     frame_step=frame_step,
                     padding='causal',
-                ),  # (B T S O)
+                ),
                 dsp.Lookahead.Config(frame_length - frame_step),
             ],
             name='test',
@@ -963,7 +392,6 @@ def test_window(self, frame_step, frame_length_multiplier):
 
     seq_out = module.layer(seq_in, training=False)
 
-    # Trim the lengths.
     expected = types.Sequence.from_lengths(
         seq_in.values[:, : -frame_length + frame_step], seq_out.lengths()
     )
diff --git a/sequence_layers/jax/normalization.py b/sequence_layers/jax/normalization.py
index 0f6b3af..7841821 100644
--- a/sequence_layers/jax/normalization.py
+++ b/sequence_layers/jax/normalization.py
@@ -14,17 +14,18 @@
 """Normalization layers."""
 
 import dataclasses
-from typing import Callable
+from typing import Callable, override
 
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import meta
 from sequence_layers.jax import types
 from sequence_layers.jax import typing as jt
 from sequence_layers.jax import utils
-
+from sequence_layers.specs import normalization as spec
 
 __all__ = (
     # go/keep-sorted start
@@ -131,11 +132,15 @@ def maybe_zero_gradient(
   return forward_fn_custom_gradient(*args)
 
 
-class L2Normalize(types.PreservesType, types.StatelessPointwise):
+class L2Normalize(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.L2Normalize[types.Sequence, types.ShapeDType],
+):
   """L2 normalization over the specified channel axes."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.L2Normalize.Config):
     """Config for L2Normalize."""
 
     axis: int | types.ShapeLike = -1
@@ -147,6 +152,7 @@ def __post_init__(self):
       if not isinstance(self.axis, int):
         object.__setattr__(self, 'axis', tuple(self.axis))
 
+    @override
     def make(self) -> 'L2Normalize':
       return L2Normalize(self, name=self.name)
 
@@ -184,11 +190,15 @@ def forward_fn(values: jax.Array) -> tuple[jt.AnyPyTree, jt.AnyPyTree]:
     return types.Sequence(y, x.mask)
 
 
-class LayerNormalization(types.PreservesType, types.StatelessPointwise):
+class LayerNormalization(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.LayerNormalization[types.Sequence, types.ShapeDType],
+):
   """Applies layer normalization to input sequences."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.LayerNormalization.Config):
     """Config for LayerNormalization."""
 
     axis: int | types.ShapeLike = -1
@@ -211,6 +221,7 @@ def __post_init__(self):
       if not isinstance(self.axis, int):
         object.__setattr__(self, 'axis', tuple(self.axis))
 
+    @override
     def make(self) -> 'LayerNormalization':
       return LayerNormalization(self, name=self.name)
 
@@ -292,7 +303,11 @@ def forward_fn(values: jax.Array) -> tuple[jt.AnyPyTree, jt.AnyPyTree]:
     return types.Sequence(y, x.mask)
 
 
-class RMSNormalization(types.PreservesType, types.StatelessPointwise):
+class RMSNormalization(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.RMSNormalization[types.Sequence, types.ShapeDType],
+):
   """A simplified version of LayerNormalization used in T5.
 
   No mean statistics or offset terms are included.
@@ -307,7 +322,7 @@ class RMSNormalization(types.PreservesType, types.StatelessPointwise):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.RMSNormalization.Config):
     """Config for RMSNormalization."""
 
     axis: int | types.ShapeLike = -1
@@ -328,6 +343,7 @@ def __post_init__(self):
       if not isinstance(self.axis, int):
         object.__setattr__(self, 'axis', tuple(self.axis))
 
+    @override
     def make(self) -> 'RMSNormalization':
       return RMSNormalization(self, name=self.name)
 
@@ -398,7 +414,11 @@ def forward_fn(values: jax.Array) -> tuple[jt.AnyPyTree, jt.AnyPyTree]:
     return types.Sequence(y, x.mask)
 
 
-class BatchNormalization(types.PreservesType, types.StatelessPointwise):
+class BatchNormalization(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.BatchNormalization[types.Sequence, types.ShapeDType],
+):
   """Applies batch normalization to the channels dimensions of input sequences.
 
   In training mode this layer computes statistics from valid sequence timesteps
@@ -411,7 +431,7 @@ class BatchNormalization(types.PreservesType, types.StatelessPointwise):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.BatchNormalization.Config):
     """Batch normalization config."""
 
     axis: int = -1
@@ -430,11 +450,13 @@ class Config(types.SequenceLayerConfig):
     guard_against_excess_precision: bool = False
     name: str | None = None
 
+    @override
     def make(self) -> 'BatchNormalization':
       return BatchNormalization(self, name=self.name)
 
   config: Config
 
+  @override
   @types.check_step
   def step(
       self,
@@ -638,14 +660,18 @@ def _masked_moments(
   return mean, variance
 
 
-class GroupNormalization(types.PreservesType, types.StatelessPointwise):
+class GroupNormalization(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.GroupNormalization[types.Sequence, types.ShapeDType],
+):
   """Applies group normalization to input sequences.
 
   https://arxiv.org/abs/1803.08494
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.GroupNormalization.Config):
     """Config for GroupNormalization."""
 
     num_groups: int
@@ -664,6 +690,7 @@ class Config(types.SequenceLayerConfig):
     guard_against_excess_precision: bool = False
     name: str | None = None
 
+    @override
     def make(self) -> 'GroupNormalization':
       if self.num_groups <= 0:
         raise ValueError(f'{self.num_groups=} must be positive.')
@@ -672,13 +699,16 @@ def make(self) -> 'GroupNormalization':
   config: Config
 
   @property
+  @override
   def supports_step(self) -> bool:
     return self.config.cumulative
 
   @property
+  @override
   def receptive_field_per_step(self) -> dict[int, types.ReceptiveField]:
     return {0: (-np.inf, 0 if self.config.cumulative else np.inf)}
 
+  @override
   def get_initial_state(
       self,
       batch_size: int,
@@ -791,6 +821,7 @@ def _scale_and_shift(
 
     return values
 
+  @override
   @types.check_step
   def step(
       self,
diff --git a/sequence_layers/jax/normalization_test.py b/sequence_layers/jax/normalization_test.py
index 22e037b..af6d887 100644
--- a/sequence_layers/jax/normalization_test.py
+++ b/sequence_layers/jax/normalization_test.py
@@ -14,68 +14,23 @@
 """Normalization tests."""
 
 import itertools
+
 from absl.testing import parameterized
 import chex
 import flax
 import jax
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import combinators
 from sequence_layers.jax import dense
 from sequence_layers.jax import normalization
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
+from sequence_layers.specs import normalization_behaviors as spec
 
 
-class L2NormalizeTest(test_utils.SequenceLayerTest):
-
-  def test_invalid_axis(self):
-    """Normalizing over the batch or time dimension is not allowed."""
-    key = jax.random.PRNGKey(1234)
-    l = normalization.L2Normalize.Config(axis=[-1, -2]).make()
-    x = test_utils.random_sequence(2, 3, 5)
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-
-  @parameterized.parameters(
-      itertools.product(
-          (False, True),
-          [
-              ((2, 10, 3), [-1]),
-              ((2, 3, 5, 9), [-1]),
-              ((2, 3, 5, 9), [-2]),
-              ((2, 3, 5, 9), [-1, -2]),
-          ],
-      )
-  )
-  def test_l2_normalization(self, training, shape_axes):
-    key = jax.random.PRNGKey(1234)
-    shape, axes = shape_axes
-    epsilon = 1e-12
-    l = normalization.L2Normalize.Config(
-        axis=axes, epsilon=epsilon, name='l2_normalization'
-    ).make()
-    x = test_utils.random_sequence(*shape)
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'l2_normalization')
-    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-
-    y = self.verify_contract(l, x, training=training)
-    self.assertEmpty(flax.core.meta.unbox(l.variables))
-
-    # Verify the train batch is normalized correctly.
-    reduce_axes = tuple(
-        a for a in range(len(shape)) if a in axes or a - len(shape) in axes
-    )
-    x_ss = np.sum(np.square(x.values), axis=reduce_axes, keepdims=True)
-
-    y_expected = types.Sequence(
-        x.values / np.sqrt(x_ss + epsilon), x.mask
-    ).mask_invalid()
-    self.assertSequencesClose(y, y_expected)
+class L2NormalizeTest(spec.L2NormalizeTest, test_utils.SequenceLayerTest):
 
   @parameterized.product(
       test_utils.standard_dtype_configs(input=True),
@@ -103,68 +58,9 @@ def test_l2_normalization_dtypes(self, input_dtype, config):
     )
 
 
-class LayerNormalizationTest(test_utils.SequenceLayerTest):
-
-  def test_invalid_axis(self):
-    """Normalizing over the batch or time dimension is not allowed."""
-    key = jax.random.PRNGKey(1234)
-    l = normalization.LayerNormalization.Config(axis=[-1, -2]).make()
-    x = test_utils.random_sequence(2, 3, 5)
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-
-  @parameterized.parameters(
-      itertools.product(
-          (False, True),
-          [
-              ((2, 10, 4), [-1], [4]),
-              ((2, 3, 5, 4), [-1], [4]),
-              ((2, 3, 4, 9), [-2], [4]),
-              ((2, 3, 4, 8), [-1, -2], [4, 8]),
-          ],
-      )
-  )
-  def test_layer_normalization(self, training, shape_axes):
-    key = jax.random.PRNGKey(1234)
-    shape, axes, expected_param_shape = shape_axes
-    l = normalization.LayerNormalization.Config(
-        axis=axes, name='layer_normalization'
-    ).make()
-    x = test_utils.random_sequence(*shape)
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'layer_normalization')
-    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-
-    y = self.verify_contract(l, x, training=training)
-    chex.assert_trees_all_equal_shapes_and_dtypes(
-        flax.core.meta.unbox(l.variables),
-        {
-            'params': {
-                'scale': jnp.zeros(expected_param_shape),
-                'bias': jnp.zeros(expected_param_shape),
-            }
-        },
-    )
-
-    # Verify the train batch is normalized correctly.
-    reduce_axes = tuple(
-        a for a in range(len(shape)) if a in axes or a - len(shape) in axes
-    )
-    mean = np.mean(y.values, axis=reduce_axes)
-    var = np.var(y.values, axis=reduce_axes)
-
-    # Invalid timesteps will have a mean and variance of zero.
-    chex.assert_trees_all_close(mean, np.zeros_like(mean), rtol=1e-6, atol=1e-6)
-    mask = y.mask.astype(jnp.float32)
-    mask = np.reshape(
-        mask, mask.shape + (1,) * (len(mean.shape) - len(mask.shape))
-    )
-    chex.assert_trees_all_close(
-        var, np.broadcast_to(mask, mean.shape), rtol=1e-4, atol=1e-4
-    )
+class LayerNormalizationTest(
+    spec.LayerNormalizationTest, test_utils.SequenceLayerTest
+):
 
   @parameterized.product(
       test_utils.standard_dtype_configs(param=True, input=True),
@@ -214,64 +110,9 @@ def test_layer_normalization_dtypes(self, param_dtype, input_dtype, config):
     )
 
 
-class RMSNormalizationTest(test_utils.SequenceLayerTest):
-
-  def test_invalid_axis(self):
-    """Normalizing over the batch or time dimension is not allowed."""
-    key = jax.random.PRNGKey(1234)
-    l = normalization.RMSNormalization.Config(
-        axis=[-1, -2],
-    ).make()
-    x = test_utils.random_sequence(2, 3, 5)
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-
-  @parameterized.parameters(
-      itertools.product(
-          (False, True),
-          [
-              ((2, 10, 3), [-1], [3]),
-              ((2, 3, 5, 9), [-1], [9]),
-              ((2, 3, 5, 9), [-2], [5]),
-              ((2, 3, 5, 9), [-1, -2], [5, 9]),
-          ],
-      )
-  )
-  def test_rms_normalization(self, training, shape_axes):
-    key = jax.random.PRNGKey(1234)
-    shape, axes, expected_param_shape = shape_axes
-    epsilon = 1e-1
-    l = normalization.RMSNormalization.Config(
-        axes, epsilon=epsilon, name='rms_normalization'
-    ).make()
-    x = test_utils.random_sequence(*shape)
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'rms_normalization')
-    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-
-    y = self.verify_contract(l, x, training=training)
-    chex.assert_trees_all_equal_shapes_and_dtypes(
-        flax.core.meta.unbox(l.variables),
-        {
-            'params': {
-                'scale': jnp.zeros(expected_param_shape),
-            }
-        },
-    )
-
-    # Verify the train batch is normalized correctly.
-    reduce_axes = tuple(
-        a for a in range(len(shape)) if a in axes or a - len(shape) in axes
-    )
-    x_ss = np.mean(np.square(x.values), axis=reduce_axes, keepdims=True)
-
-    y_expected = types.Sequence(
-        x.values / np.sqrt(x_ss + epsilon), x.mask
-    ).mask_invalid()
-    self.assertSequencesClose(y, y_expected)
+class RMSNormalizationTest(
+    spec.RMSNormalizationTest, test_utils.SequenceLayerTest
+):
 
   @parameterized.product(
       test_utils.standard_dtype_configs(param=True, input=True),
@@ -314,22 +155,9 @@ def test_rms_normalization_dtypes(self, param_dtype, input_dtype, config):
     )
 
 
-class BatchNormalizationTest(test_utils.SequenceLayerTest):
-
-  def test_batch_normalization_invalid_axis(self):
-    """Normalizing over the batch or time dimension is not allowed."""
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(2, 3, 5)
-    l = normalization.BatchNormalization.Config(axis=0).make()
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-
-    l = normalization.BatchNormalization.Config(axis=1).make()
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-
-    l = normalization.BatchNormalization.Config(axis=2).make()
-    self.init_and_bind_layer(key, l, x)
+class BatchNormalizationTest(
+    spec.BatchNormalizationTest, test_utils.SequenceLayerTest
+):
 
   @parameterized.parameters(
       ((4, 10, 3), -1, [3]),
@@ -488,124 +316,9 @@ def test_batch_normalization_dtypes(self, param_dtype, input_dtype, config):
     )
 
 
-class GroupNormalizationTest(test_utils.SequenceLayerTest):
-
-  def test_invalid_axis(self):
-    """Normalizing over the batch or time dimension is not allowed."""
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(2, 3, 5)
-    l = normalization.GroupNormalization.Config(num_groups=1, axis=0).make()
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-
-    l = normalization.GroupNormalization.Config(num_groups=1, axis=1).make()
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-
-    l = normalization.GroupNormalization.Config(num_groups=1, axis=2).make()
-    self.init_and_bind_layer(key, l, x)
-
-  def test_invalid_groups(self):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(2, 3, 5)
-    l = normalization.GroupNormalization.Config(num_groups=2).make()
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-
-  @parameterized.parameters(
-      itertools.product(
-          [
-              ((8, 6, 6), -1, 3, [6]),
-              ((8, 6, 5, 6), -2, 5, [5]),
-              ((8, 6, 5, 6), -2, 1, [5]),
-          ],
-          (False, True),
-      )
-  )
-  def test_group_normalization(self, shape_axes, cumulative):
-    key = jax.random.PRNGKey(1234)
-    shape, axis, num_groups, expected_param_shape = shape_axes
-    l = normalization.GroupNormalization.Config(
-        num_groups=num_groups,
-        cumulative=cumulative,
-        axis=axis,
-        name='group_normalization',
-    ).make()
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'group_normalization')
-
-    x = test_utils.random_sequence(*shape)
-    l = self.init_and_bind_layer(key, l, x, randomize_weights=False)
-    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-
-    y = self.verify_contract(
-        l,
-        x,
-        training=True,
-        grad_rtol=1e-5,
-        grad_atol=1e-4,
-    )
-    y_test = self.verify_contract(
-        l,
-        x,
-        training=False,
-        grad_rtol=1e-5,
-        grad_atol=1e-4,
-    )
-
-    # Training mode doesn't affect behavior.
-    self.assertSequencesEqual(y, y_test)
-
-    unboxed_variables = flax.core.meta.unbox(l.variables)
-    chex.assert_trees_all_equal_shapes_and_dtypes(
-        unboxed_variables,
-        {
-            'params': {
-                'scale': jnp.zeros(expected_param_shape),
-                'bias': jnp.zeros(expected_param_shape),
-            }
-        },
-    )
-
-    axis = axis + x.ndim if axis < 0 else axis
-    axis_dim = y.values.shape[axis]
-    group_size = axis_dim // num_groups
-    outer_dims, _, inner_dims = np.split(y.values.shape, [axis, axis + 1])
-
-    expanded_param_shape = [1] * y.values.ndim
-    expanded_param_shape[axis] = axis_dim
-    scale = unboxed_variables['params']['scale'].reshape(expanded_param_shape)
-    bias = unboxed_variables['params']['bias'].reshape(expanded_param_shape)
-
-    y_grouped = np.reshape(
-        (y.values - bias) / scale,
-        outer_dims.tolist() + [num_groups, group_size] + inner_dims.tolist(),
-    )
-
-    if cumulative:
-      # TODO(rryan): Test cumulative mode numerically.
-      return
-
-    reduction_dims = [a for a in range(y_grouped.ndim) if a not in (0, axis)]
-
-    expanded_mask = types.Sequence(y_grouped, x.mask).expanded_mask()
-
-    # Check each group is mean zero and unit variance.
-    mean = np.mean(
-        y_grouped, axis=reduction_dims, keepdims=True, where=expanded_mask
-    )
-    var = np.var(
-        y_grouped, axis=reduction_dims, keepdims=True, where=expanded_mask
-    )
-
-    # Handle zero length sequences. The moment calculation avoids NaNs by
-    # capping divisors at 1.
-    mean = np.where(np.isnan(mean), np.zeros_like(mean), mean)
-    var = np.where(np.isnan(var), np.ones_like(var), var)
-
-    chex.assert_trees_all_close(mean, jnp.zeros_like(mean), atol=1e-6)
-    chex.assert_trees_all_close(var, jnp.ones_like(var), atol=1e-4)
+class GroupNormalizationTest(
+    spec.GroupNormalizationTest, test_utils.SequenceLayerTest
+):
 
   @parameterized.product(
       test_utils.standard_dtype_configs(param=True, input=True),
diff --git a/sequence_layers/jax/pooling.py b/sequence_layers/jax/pooling.py
index 5377b5e..1c4677a 100644
--- a/sequence_layers/jax/pooling.py
+++ b/sequence_layers/jax/pooling.py
@@ -27,6 +27,8 @@
 from sequence_layers.jax import typing as jt
 from sequence_layers.jax import utils
 from typing_extensions import override
+from sequence_layers.specs import pooling as spec
+
 
 __all__ = (
     # go/keep-sorted start
@@ -101,7 +103,10 @@ def pack(value: TypingSequence[Any], default: Any) -> tuple[Any, ...]:
 
 
 class BasePooling(
-    types.PreservesType, types.SequenceLayer, metaclass=abc.ABCMeta
+    types.PreservesType,
+    types.SequenceLayer,
+    spec.BasePooling[types.Sequence, types.ShapeDType],
+    metaclass=abc.ABCMeta,
 ):
   """Shared base logic for pooling layers."""
 
@@ -522,11 +527,15 @@ def _paddings(self) -> tuple[Any, ...]:
     return (self.config.time_padding, *self.config.spatial_padding)
 
 
-class MinPooling1D(Pooling1DMixin, BaseMinPooling):
+class MinPooling1D(
+    Pooling1DMixin,
+    BaseMinPooling,
+    spec.MinPooling1D[types.Sequence, types.ShapeDType],
+):
   """A 1D min pooling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.MinPooling1D.Config):
     """Config for MinPooling1D."""
 
     pool_size: int
@@ -544,11 +553,15 @@ def make(self) -> 'MinPooling1D':
   config: Config
 
 
-class MaxPooling1D(Pooling1DMixin, BaseMaxPooling):
+class MaxPooling1D(
+    Pooling1DMixin,
+    BaseMaxPooling,
+    spec.MaxPooling1D[types.Sequence, types.ShapeDType],
+):
   """A 1D max pooling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.MaxPooling1D.Config):
     """Config for MaxPooling1D."""
 
     pool_size: int
@@ -566,11 +579,15 @@ def make(self) -> 'MaxPooling1D':
   config: Config
 
 
-class AveragePooling1D(Pooling1DMixin, BaseAveragePooling):
+class AveragePooling1D(
+    Pooling1DMixin,
+    BaseAveragePooling,
+    spec.AveragePooling1D[types.Sequence, types.ShapeDType],
+):
   """A 1D average pooling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.AveragePooling1D.Config):
     """Config for AveragePooling1D."""
 
     pool_size: int
@@ -591,11 +608,15 @@ def make(self) -> 'AveragePooling1D':
   config: Config
 
 
-class MinPooling2D(Pooling2DMixin, BaseMinPooling):
+class MinPooling2D(
+    Pooling2DMixin,
+    BaseMinPooling,
+    spec.MinPooling2D[types.Sequence, types.ShapeDType],
+):
   """A 2D min pooling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.MinPooling2D.Config):
     """Config for MinPooling2D."""
 
     pool_size: int | TypingSequence[int]
@@ -644,11 +665,15 @@ def make(self) -> 'MinPooling2D':  # pytype: disable=invalid-annotation
   config: Config
 
 
-class MaxPooling2D(Pooling2DMixin, BaseMaxPooling):
+class MaxPooling2D(
+    Pooling2DMixin,
+    BaseMaxPooling,
+    spec.MaxPooling2D[types.Sequence, types.ShapeDType],
+):
   """A 2D max pooling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.MaxPooling2D.Config):
     """Config for MaxPooling2D."""
 
     pool_size: int | TypingSequence[int]
@@ -697,11 +722,15 @@ def make(self) -> 'MaxPooling2D':  # pytype: disable=invalid-annotation
   config: Config
 
 
-class AveragePooling2D(Pooling2DMixin, BaseAveragePooling):
+class AveragePooling2D(
+    Pooling2DMixin,
+    BaseAveragePooling,
+    spec.AveragePooling2D[types.Sequence, types.ShapeDType],
+):
   """A 2D average pooling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.AveragePooling2D.Config):
     """Config for AveragePooling2D."""
 
     pool_size: int | TypingSequence[int]
@@ -754,11 +783,15 @@ def make(self) -> 'AveragePooling2D':  # pytype: disable=invalid-annotation
   config: Config
 
 
-class MinPooling3D(Pooling3DMixin, BaseMinPooling):
+class MinPooling3D(
+    Pooling3DMixin,
+    BaseMinPooling,
+    spec.MinPooling3D[types.Sequence, types.ShapeDType],
+):
   """A 3D min pooling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.MinPooling3D.Config):
     """Config for MinPooling3D."""
 
     pool_size: int | TypingSequence[int]
@@ -809,11 +842,15 @@ def make(self) -> 'MinPooling3D':  # pytype: disable=invalid-annotation
   config: Config
 
 
-class MaxPooling3D(Pooling3DMixin, BaseMaxPooling):
+class MaxPooling3D(
+    Pooling3DMixin,
+    BaseMaxPooling,
+    spec.MaxPooling3D[types.Sequence, types.ShapeDType],
+):
   """A 3D max pooling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.MaxPooling3D.Config):
     """Config for MaxPooling3D."""
 
     pool_size: int | TypingSequence[int]
@@ -864,11 +901,15 @@ def make(self) -> 'MaxPooling3D':  # pytype: disable=invalid-annotation
   config: Config
 
 
-class AveragePooling3D(Pooling3DMixin, BaseAveragePooling):
+class AveragePooling3D(
+    Pooling3DMixin,
+    BaseAveragePooling,
+    spec.AveragePooling3D[types.Sequence, types.ShapeDType],
+):
   """A 3D average pooling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig, spec.AveragePooling3D.Config):
     """Config for AveragePooling3D."""
 
     pool_size: int | TypingSequence[int]
diff --git a/sequence_layers/jax/pooling_test.py b/sequence_layers/jax/pooling_test.py
index 7b5fe2a..8a08420 100644
--- a/sequence_layers/jax/pooling_test.py
+++ b/sequence_layers/jax/pooling_test.py
@@ -18,160 +18,17 @@
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
+import numpy as np
 from sequence_layers.jax import convolution
 from sequence_layers.jax import pooling
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
+from sequence_layers.specs import pooling_behaviors as spec
 
 
-class Pooling1DTest(test_utils.SequenceLayerTest):
 
-  @parameterized.product(
-      pool_type_kwargs=(
-          ('min', {}),
-          ('max', {}),
-          ('average', {'masked_average': False}),
-          ('average', {'masked_average': True}),
-      ),
-      params=[
-          # 1x1 conv.
-          (1, 1, 1),
-          # even pool_size with smaller, equal and larger strides.
-          (2, 1, 1),
-          (2, 2, 1),
-          (2, 3, 1),
-          # odd pool_size with smaller, equal and larger strides.
-          (3, 2, 1),
-          (3, 3, 1),
-          (3, 4, 1),
-          # pool_size smaller, equal and larger than even dilation_rate.
-          (1, 1, 2),
-          (2, 1, 2),
-          (3, 1, 2),
-          # pool_size smaller, equal and larger than odd dilation_rate.
-          (1, 1, 3),
-          (2, 1, 3),
-          (3, 1, 3),
-      ],
-      padding=[
-          'same',
-          'valid',
-          'reverse_causal_valid',
-          'causal',
-          'reverse_causal',
-          'semicausal',
-      ],
-  )
-  def test_pooling1d(self, pool_type_kwargs, params, padding):
-    pool_type, kwargs = pool_type_kwargs
-    return self._test_pooling1d(
-        pool_type,
-        params,
-        (3,),
-        padding,
-        jnp.float32,
-        **kwargs,
-    )
-
-  @parameterized.product(
-      pool_type_kwargs=(
-          ('min', {}),
-          ('max', {}),
-          ('average', {'masked_average': False}),
-          ('average', {'masked_average': True}),
-      ),
-      dtype=(jnp.float32, jnp.int32),
-  )
-  def test_dtypes(self, pool_type_kwargs, dtype):
-    pool_type, kwargs = pool_type_kwargs
-    return self._test_pooling1d(
-        pool_type, (3, 2, 1), (3,), 'reverse_causal', dtype, **kwargs
-    )
-
-  @parameterized.product(
-      pool_type_kwargs=(
-          ('min', {}),
-          ('max', {}),
-          ('average', {'masked_average': False}),
-          ('average', {'masked_average': True}),
-      ),
-      channel_shape=(
-          (),
-          (3,),
-          (3, 5),
-      ),
-  )
-  def test_channel_shapes(self, pool_type_kwargs, channel_shape):
-    pool_type, kwargs = pool_type_kwargs
-    return self._test_pooling1d(
-        pool_type,
-        (3, 2, 1),
-        channel_shape,
-        'reverse_causal',
-        jnp.float32,
-        **kwargs,
-    )
-
-  @parameterized.product(
-      masked_average=[True, False],
-  )
-  def test_masked_average(self, masked_average):
-    key = jax.random.PRNGKey(1234)
-    pool_size, stride, dilation_rate = 3, 3, 1
-    padding = 'reverse_causal'
-    l = pooling.AveragePooling1D.Config(
-        pool_size=pool_size,
-        strides=stride,
-        dilation_rate=dilation_rate,
-        padding=padding,
-        name='pool_1d',
-        masked_average=masked_average,
-    ).make()
-
-    x = types.Sequence(
-        jnp.array([
-            [1, 2, 3, 4, 5, 6],
-            [3, 4, 5, 6, 7, 8],
-            [5, 6, 7, 8, 9, 0],
-            [2, 3, 0, 6, 2, 1],
-            [0, 6, 2, 1, 7, 8],
-        ]).astype(jnp.float32),
-        jnp.array([
-            [False, False, False, False, False, False],
-            [True, True, True, False, False, False],
-            [True, True, True, True, False, False],
-            [True, True, True, True, True, False],
-            [True, True, True, True, True, True],
-        ]),
-    )
-    l = self.init_and_bind_layer(key, l, x)
-    y = l(x, training=False)
-    if masked_average:
-      expected_y_values = jnp.array([
-          [0.0, 0.0],
-          [(3 + 4 + 5) / 3.0, 0],
-          [(5 + 6 + 7) / 3.0, 8],
-          [(2 + 3 + 0) / 3.0, (6 + 2) / 2.0],
-          [(0 + 6 + 2) / 3.0, (1 + 7 + 8) / 3.0],
-      ])
-    else:
-      expected_y_values = jnp.array([
-          [0.0, 0.0],
-          [(3 + 4 + 5) / 3.0, 0],
-          [(5 + 6 + 7) / 3.0, 8 / 3.0],
-          [(2 + 3 + 0) / 3.0, (6 + 2) / 3.0],
-          [(0 + 6 + 2) / 3.0, (1 + 7 + 8) / 3.0],
-      ])
-    expected_y_mask = jnp.array([
-        [False, False],
-        [True, False],
-        [True, True],
-        [True, True],
-        [True, True],
-    ])
-    expected_y = types.Sequence(expected_y_values, expected_y_mask)
-    self.assertSequencesClose(y, expected_y)
+class Pooling1DTest(test_utils.SequenceLayerTest, spec.Pooling1DTest):
 
   def _test_pooling1d(
       self, pool_type, params, channel_shape, padding, dtype, **kwargs
@@ -191,11 +48,7 @@ def _test_pooling1d(
             name='pool_1d',
             **kwargs,
         ).make()
-        pad_value = (
-            jnp.inf
-            if jnp.issubdtype(dtype, jnp.floating)
-            else jnp.iinfo(dtype).max
-        )
+        pad_value = np.inf
         golden_fn = lambda x: nn.pooling.min_pool(
             x.values,
             window_shape=(pool_size,),
@@ -211,11 +64,7 @@ def _test_pooling1d(
             name='pool_1d',
             **kwargs,
         ).make()
-        pad_value = (
-            -jnp.inf
-            if jnp.issubdtype(dtype, jnp.floating)
-            else jnp.iinfo(dtype).min
-        )
+        pad_value = -np.inf
         golden_fn = lambda x: nn.pooling.max_pool(
             x.values,
             window_shape=(pool_size,),
@@ -335,197 +184,7 @@ def _test_pooling1d(
         chex.assert_trees_all_equal(y.mask, mask_golden)
 
 
-class Pooling2DTest(test_utils.SequenceLayerTest):
-
-  @parameterized.product(
-      pool_type_kwargs=(
-          ('min', {}),
-          ('max', {}),
-          ('average', {'masked_average': False}),
-          ('average', {'masked_average': True}),
-      ),
-      params=[
-          # 1x1 conv.
-          (1, 1, 1),
-          # even pool_size with smaller, equal and larger strides.
-          (2, 1, 1),
-          (2, 2, 1),
-          (2, 3, 1),
-          # odd pool_size with smaller, equal and larger strides.
-          (3, 2, 1),
-          (3, 3, 1),
-          (3, 4, 1),
-          # pool_size smaller, equal and larger than even dilation_rate.
-          (1, 1, 2),
-          (2, 1, 2),
-          (3, 1, 2),
-          # pool_size smaller, equal and larger than odd dilation_rate.
-          (1, 1, 3),
-          (2, 1, 3),
-          (3, 1, 3),
-      ],
-      time_padding=[
-          'same',
-          'valid',
-          'reverse_causal_valid',
-          'causal',
-          'reverse_causal',
-          'semicausal',
-      ],
-  )
-  def test_pooling2d(
-      self,
-      pool_type_kwargs,
-      params,
-      time_padding,
-  ):
-    pool_type, kwargs = pool_type_kwargs
-    self._test_pooling2d(
-        pool_type,
-        params,
-        (9,),
-        time_padding,
-        'same',
-        jnp.float32,
-        **kwargs,
-    )
-
-  @parameterized.product(
-      pool_type_kwargs=(
-          ('min', {}),
-          ('max', {}),
-          ('average', {'masked_average': False}),
-          ('average', {'masked_average': True}),
-      ),
-      spatial_padding=[
-          'same',
-          'valid',
-          'reverse_causal_valid',
-          'causal',
-          'reverse_causal',
-          'semicausal',
-      ],
-  )
-  def test_spatial_padding(self, pool_type_kwargs, spatial_padding):
-    pool_type, kwargs = pool_type_kwargs
-    return self._test_pooling2d(
-        pool_type,
-        (3, 2, 1),
-        (9,),
-        'reverse_causal',
-        spatial_padding,
-        jnp.float32,
-        **kwargs,
-    )
-
-  @parameterized.product(
-      pool_type_kwargs=(
-          ('min', {}),
-          ('max', {}),
-          ('average', {'masked_average': False}),
-          ('average', {'masked_average': True}),
-      ),
-      dtype=(jnp.float32, jnp.int32),
-  )
-  def test_dtypes(self, pool_type_kwargs, dtype):
-    jax.config.update('jax_traceback_filtering', 'off')
-    pool_type, kwargs = pool_type_kwargs
-    return self._test_pooling2d(
-        pool_type,
-        (3, 2, 1),
-        (9,),
-        'reverse_causal',
-        'reverse_causal',
-        dtype,
-        **kwargs,
-    )
-
-  @parameterized.product(
-      pool_type_kwargs=(
-          ('min', {}),
-          ('max', {}),
-          ('average', {'masked_average': False}),
-          ('average', {'masked_average': True}),
-      ),
-      channel_shape=(
-          (9,),
-          (9, 5),
-          (9, 5, 3),
-      ),
-  )
-  def test_channel_shapes(self, pool_type_kwargs, channel_shape):
-    pool_type, kwargs = pool_type_kwargs
-    return self._test_pooling2d(
-        pool_type,
-        (3, 2, 1),
-        channel_shape,
-        'reverse_causal',
-        'reverse_causal',
-        jnp.float32,
-        **kwargs,
-    )
-
-  @parameterized.product(
-      masked_average=[True, False],
-  )
-  def test_masked_average(self, masked_average):
-    key = jax.random.PRNGKey(1234)
-    pool_size, stride, dilation_rate = (3, 2), (3, 2), (1, 1)
-    time_padding = 'reverse_causal'
-    spatial_padding = 'reverse_causal'
-    l = pooling.AveragePooling2D.Config(
-        pool_size=pool_size,
-        strides=stride,
-        dilation_rate=dilation_rate,
-        time_padding=time_padding,
-        spatial_padding=spatial_padding,
-        name='pool_2d',
-        masked_average=masked_average,
-    ).make()
-
-    x = types.Sequence(
-        jnp.array([
-            [[1, 2], [2, 3], [5, 6], [7, 8], [9, 3], [4, 2]],
-            [[2, 3], [5, 6], [7, 8], [9, 3], [3, 1], [2, 7]],
-            [[5, 2], [7, 3], [0, 3], [3, 1], [2, 6], [1, 2]],
-            [[7, 3], [0, 3], [3, 1], [2, 6], [1, 2], [3, 4]],
-            [[0, 3], [3, 1], [2, 6], [1, 2], [3, 4], [5, 7]],
-        ]).astype(jnp.float32),
-        jnp.array([
-            [False, False, False, False, False, False],
-            [True, True, True, False, False, False],
-            [True, True, True, True, False, False],
-            [True, True, True, True, True, False],
-            [True, True, True, True, True, True],
-        ]),
-    )
-    l = self.init_and_bind_layer(key, l, x)
-    y = l(x, training=False)
-    if masked_average:
-      expected_y_values = jnp.array([
-          [[0.0], [0.0]],
-          [[(2 + 5 + 7 + 3 + 6 + 8) / 6.0], [0]],
-          [[(5 + 7 + 0 + 2 + 3 + 3) / 6.0], [(3 + 1) / 2.0]],
-          [[(7 + 0 + 3 + 3 + 3 + 1) / 6.0], [(2 + 1 + 6 + 2) / 4.0]],
-          [[(0 + 3 + 2 + 3 + 1 + 6) / 6.0], [(1 + 3 + 5 + 2 + 4 + 7) / 6.0]],
-      ])
-    else:
-      expected_y_values = jnp.array([
-          [[0.0], [0.0]],
-          [[(2 + 5 + 7 + 3 + 6 + 8) / 6.0], [0]],
-          [[(5 + 7 + 0 + 2 + 3 + 3) / 6.0], [(3 + 1) / 6.0]],
-          [[(7 + 0 + 3 + 3 + 3 + 1) / 6.0], [(2 + 1 + 6 + 2) / 6.0]],
-          [[(0 + 3 + 2 + 3 + 1 + 6) / 6.0], [(1 + 3 + 5 + 2 + 4 + 7) / 6.0]],
-      ])
-    expected_y_mask = jnp.array([
-        [False, False],
-        [True, False],
-        [True, True],
-        [True, True],
-        [True, True],
-    ])
-    expected_y = types.Sequence(expected_y_values, expected_y_mask)
-    self.assertSequencesClose(y, expected_y)
+class Pooling2DTest(test_utils.SequenceLayerTest, spec.Pooling2DTest):
 
   def _test_pooling2d(
       self,
@@ -556,11 +215,7 @@ def _test_pooling2d(
             name='pool_2d',
             **kwargs,
         ).make()
-        pad_value = (
-            jnp.inf
-            if jnp.issubdtype(dtype, jnp.floating)
-            else jnp.iinfo(dtype).max
-        )
+        pad_value = np.inf
         golden_fn = lambda x: nn.pooling.min_pool(
             x.values,
             window_shape=(pool_size, pool_size),
@@ -577,11 +232,7 @@ def _test_pooling2d(
             name='pool_2d',
             **kwargs,
         ).make()
-        pad_value = (
-            -jnp.inf
-            if jnp.issubdtype(dtype, jnp.floating)
-            else jnp.iinfo(dtype).min
-        )
+        pad_value = -np.inf
         golden_fn = lambda x: nn.pooling.max_pool(
             x.values,
             window_shape=(pool_size, pool_size),
@@ -713,7 +364,7 @@ def _test_pooling2d(
         chex.assert_trees_all_equal(y.mask, mask_golden)
 
 
-class Pooling3DTest(test_utils.SequenceLayerTest):
+class Pooling3DTest(test_utils.SequenceLayerTest, spec.Pooling3DTest):
 
   @parameterized.product(
       pool_type_kwargs=(
@@ -943,11 +594,7 @@ def _test_pooling3d(
             name='pool_3d',
             **kwargs,
         ).make()
-        pad_value = (
-            jnp.inf
-            if jnp.issubdtype(dtype, jnp.floating)
-            else jnp.iinfo(dtype).max
-        )
+        pad_value = np.inf
         golden_fn = lambda x: nn.pooling.min_pool(
             x.values,
             window_shape=(pool_size, pool_size, pool_size),
@@ -968,11 +615,7 @@ def _test_pooling3d(
             name='pool_3d',
             **kwargs,
         ).make()
-        pad_value = (
-            -jnp.inf
-            if jnp.issubdtype(dtype, jnp.floating)
-            else jnp.iinfo(dtype).min
-        )
+        pad_value = -np.inf
         golden_fn = lambda x: nn.pooling.max_pool(
             x.values,
             window_shape=(pool_size, pool_size, pool_size),
diff --git a/sequence_layers/jax/position.py b/sequence_layers/jax/position.py
index eb040c5..670045c 100644
--- a/sequence_layers/jax/position.py
+++ b/sequence_layers/jax/position.py
@@ -14,13 +14,16 @@
 """Position embeddings and timing signals."""
 
 import dataclasses
+from typing import override
 
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import types
 from sequence_layers.jax import utils
+from sequence_layers.specs import position as position_spec
 
 __all__ = (
     # go/keep-sorted start
@@ -31,26 +34,20 @@
 
 
 class AddTimingSignal(
-    types.PreservesType, types.PreservesShape, types.SequenceLayer
+    types.PreservesType,
+    types.PreservesShape,
+    types.SequenceLayer,
+    position_spec.AddTimingSignal[types.Sequence, types.ChannelSpec],
 ):
   """Adds sinusoids at varying frequencies to the input channels dimension."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(position_spec.AddTimingSignal.Config):
     """Config for AddTimingSignal."""
 
-    min_timescale: float = 1.0
-    max_timescale: float = 1.0e4
-    trainable_scale: bool = False
-    # Channel axes over which the timing signal's entries should vary.
-    axes: int | tuple[int, ...] | None = None
-    sharding: types.Sharding | None = None
     param_dtype: types.DType = jnp.float32
-    # If true, only advances position counter for valid timesteps. If false, the
-    # position is determined by the physical length of the inputs.
-    only_advance_position_for_valid_timesteps: bool = True
-    name: str | None = None
 
+    @override
     def make(self) -> 'AddTimingSignal':
       return AddTimingSignal(self, name=self.name)
 
@@ -180,7 +177,12 @@ def layer(
 
 
 class ApplyRotaryPositionalEncoding(
-    types.PreservesType, types.PreservesShape, types.SequenceLayer
+    types.PreservesType,
+    types.PreservesShape,
+    types.SequenceLayer,
+    position_spec.ApplyRotaryPositionalEncoding[
+        types.Sequence, types.ChannelSpec
+    ],
 ):
   """Applies Rotary Positional Encodings (RoPE) to the sequence.
 
@@ -189,25 +191,10 @@ class ApplyRotaryPositionalEncoding(
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(position_spec.ApplyRotaryPositionalEncoding.Config):
     """Config for ApplyRotaryPositionalEncoding."""
 
-    max_wavelength: float
-    axis: int = -1
-    # If true, only advances position counter for valid timesteps. If false, the
-    # position is determined by the physical length of the inputs.
-    only_advance_position_for_valid_timesteps: bool = True
-    # Whether RoPE should be applied with positions in at least float32. This
-    # option is for backwards compatibility. True is the recommended value.
-    positions_in_at_least_fp32: bool = True
-    # If specified, the [batch_size, time] jnp.int32 position used for computing
-    # RoPE will be read from the constants dictionary with this name. Otherwise,
-    # the physical position in the array is used. If specified,
-    # only_advance_position_for_valid_timesteps has no effect.
-    positions_name: str | None = None
-    # An optional name for the layer.
-    name: str | None = None
-
+    @override
     def make(self) -> 'ApplyRotaryPositionalEncoding':
       return ApplyRotaryPositionalEncoding(self, name=self.name)
 
diff --git a/sequence_layers/jax/position_test.py b/sequence_layers/jax/position_test.py
index 4dc7d2b..7adf2fb 100644
--- a/sequence_layers/jax/position_test.py
+++ b/sequence_layers/jax/position_test.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Google LLC
+# Copyright 2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,165 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tests for position layers."""
+"""Tests for position layers in JAX."""
 
 from absl.testing import parameterized
 import chex
 import flax
 import jax
 import jax.numpy as jnp
-import numpy as np
-from sequence_layers.jax import position as position_lib
-from sequence_layers.jax import test_utils
-from sequence_layers.jax import types
-
-
-class AddTimingSignalTest(test_utils.SequenceLayerTest):
-
-  @parameterized.parameters(
-      dict(
-          min_timescale=1.0,
-          max_timescale=1.0e4,
-          trainable_scale=True,
-          channel_shape=(3,),
-          axes=None,
-      ),
-      dict(
-          min_timescale=1.0,
-          max_timescale=1.0e4,
-          trainable_scale=False,
-          channel_shape=(3,),
-          axes=None,
-      ),
-      dict(
-          min_timescale=10.0,
-          max_timescale=1.0e5,
-          trainable_scale=False,
-          channel_shape=(3,),
-          axes=0,
-      ),
-      dict(
-          min_timescale=1.0,
-          max_timescale=1.0e4,
-          trainable_scale=True,
-          channel_shape=(5, 9),
-          axes=(1,),
-      ),
-      dict(
-          min_timescale=1.0,
-          max_timescale=1.0e4,
-          trainable_scale=True,
-          channel_shape=(5, 9, 3),
-          axes=[1, 2],
-      ),
-      dict(
-          min_timescale=1.0,
-          max_timescale=1.0e4,
-          trainable_scale=True,
-          channel_shape=(5, 9),
-          axes=(1,),
-          only_advance_position_for_valid_timesteps=False,
-      ),
-  )
-  def test_basic(
-      self,
-      min_timescale,
-      max_timescale,
-      trainable_scale,
-      channel_shape,
-      axes,
-      only_advance_position_for_valid_timesteps=True,
-  ):
-    key = jax.random.PRNGKey(1234)
-    l = position_lib.AddTimingSignal.Config(
-        min_timescale=min_timescale,
-        max_timescale=max_timescale,
-        trainable_scale=trainable_scale,
-        axes=axes,
-        only_advance_position_for_valid_timesteps=only_advance_position_for_valid_timesteps,
-        name='add_timing_signal',
-    ).make()
-
-    batch_size = 8
-    x = test_utils.random_sequence(batch_size, 1, *channel_shape)
-    l = self.init_and_bind_layer(key, l, x)
 
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'add_timing_signal')
-    self.assertEqual(l.get_output_shape_for_sequence(x), x.shape[2:])
-
-    unboxed_variables = flax.core.meta.unbox(l.variables)
-    if trainable_scale:
-      chex.assert_trees_all_equal_shapes_and_dtypes(
-          unboxed_variables,
-          {
-              'params': {
-                  'scale': jnp.zeros([]),
-              }
-          },
-      )
-    else:
-      self.assertEmpty(jax.tree_util.tree_leaves(unboxed_variables))
-
-    for time in range(13 * l.block_size, 15 * l.block_size):
-      # Test non-contiguous masks to demonstrate that
-      # only_advance_position_for_valid_timesteps works.
-      x = test_utils.random_sequence(
-          batch_size,
-          time,
-          *channel_shape,
-          random_mask=True,
-      )
-      self.verify_contract(l, x, training=False, grad_atol=1e-5, grad_rtol=1e-5)
-
-  @parameterized.parameters(
-      dict(channel_shape=(2, 3), axes=-1, normalized_axes=(1,)),
-      dict(channel_shape=(2, 3, 5), axes=[0, 2], normalized_axes=(0, 2)),
-  )
-  def test_timing_signal_along_axes(self, channel_shape, axes, normalized_axes):
-    key = jax.random.PRNGKey(1234)
-    layer = position_lib.AddTimingSignal.Config(
-        axes=axes,
-        name='add_timing_signal',
-    ).make()
-
-    batch_size = 2
-    seq_len = 3
-    inputs = types.Sequence.from_values(
-        jnp.zeros((batch_size, seq_len, *channel_shape))
-    )
-    layer = self.init_and_bind_layer(key, layer, inputs)
-    outputs = layer(inputs, training=False)
-    outputs = np.asarray(outputs.values[0, -1])
+from sequence_layers.jax import test_utils
+from sequence_layers.specs import position_behaviors
 
-    channel_dims = len(channel_shape)
 
-    with self.subTest('equal_along_broadcasted_axes'):
-      broadcast_slice_0 = tuple(
-          slice(None) if axis in normalized_axes else 0
-          for axis in range(channel_dims)
-      )
-      broadcast_slice_1 = tuple(
-          slice(None) if axis in normalized_axes else 1
-          for axis in range(channel_dims)
-      )
-      self.assertAllEqual(
-          outputs[broadcast_slice_0], outputs[broadcast_slice_1]
-      )
-
-    with self.subTest('not_equal_over_all_axes'):
-      complementary_slice_0 = tuple(
-          0 if axis in normalized_axes else slice(None)
-          for axis in range(channel_dims)
-      )
-      complementary_slice_1 = tuple(
-          1 if axis in normalized_axes else slice(None)
-          for axis in range(channel_dims)
-      )
-      self.assertNotAllEqual(
-          outputs[complementary_slice_0], outputs[complementary_slice_1]
-      )
+class AddTimingSignalTest(
+    position_behaviors.AddTimingSignalTest,
+    test_utils.SequenceLayerTest,
+    parameterized.TestCase,
+):
 
   @parameterized.product(
       test_utils.standard_dtype_configs(param=True, input=True),
@@ -184,21 +42,21 @@ def test_dtypes(
     channel_shape = (2, 3)
     min_timescale = 1.0
     max_timescale = 1.0e4
-    key = jax.random.PRNGKey(1234)
-    l = position_lib.AddTimingSignal.Config(
+    config = self.sl.AddTimingSignal.Config(
         min_timescale=min_timescale,
         max_timescale=max_timescale,
         trainable_scale=trainable_scale,
         param_dtype=param_dtype,
         name='add_timing_signal',
-    ).make()
-
-    batch_size = 2
-    x = test_utils.random_sequence(
-        batch_size, 1, *channel_shape, dtype=input_dtype
     )
-    l = self.init_and_bind_layer(key, l, x)
-    unboxed_variables = flax.core.meta.unbox(l.variables)
+    layer = self.make_layer(config)
+    batch_size = 2
+    x = self.random_sequence(batch_size, 1, *channel_shape, dtype=input_dtype)
+    layer = self.init_layer(layer, x)
+
+    # Check params dtype if trainable
+    variables = self.get_variables(layer)
+    unboxed_variables = flax.core.meta.unbox(variables)
     if trainable_scale:
       chex.assert_trees_all_equal_shapes_and_dtypes(
           unboxed_variables,
@@ -211,56 +69,23 @@ def test_dtypes(
     else:
       self.assertEmpty(jax.tree_util.tree_leaves(unboxed_variables))
 
-    for time in range(13 * l.block_size, 15 * l.block_size):
-      x = test_utils.random_sequence(
+    for time in range(13 * layer.block_size, 15 * layer.block_size):
+      x = self.random_sequence(
           batch_size, time, *channel_shape, dtype=input_dtype
       )
       self.verify_contract(
-          l,
+          layer,
           x,
           training=False,
-          **test_utils.get_grad_tols(l, x, param_dtype, input_dtype),
+          **test_utils.get_grad_tols(layer, x, param_dtype, input_dtype),
       )
 
 
-class ApplyRotaryPositionalEncodingTest(test_utils.SequenceLayerTest):
-
-  @parameterized.product(
-      max_wavelength=(1.0e4, 1.0e5),
-      channel_shape=((4,), (3, 6)),
-      only_advance_position_for_valid_timesteps=(False, True),
-  )
-  def test_basic(
-      self,
-      max_wavelength,
-      channel_shape,
-      only_advance_position_for_valid_timesteps,
-  ):
-    key = jax.random.PRNGKey(1234)
-    l = position_lib.ApplyRotaryPositionalEncoding.Config(
-        max_wavelength=max_wavelength,
-        only_advance_position_for_valid_timesteps=only_advance_position_for_valid_timesteps,
-        name='rope',
-    ).make()
-
-    batch_size = 2
-    x = test_utils.random_sequence(batch_size, 1, *channel_shape)
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'rope')
-    self.assertEqual(l.get_output_shape_for_sequence(x), x.shape[2:])
-    self.assertEmpty(jax.tree_util.tree_leaves(l.variables))
-
-    for time in range(13 * l.block_size, 15 * l.block_size):
-      x = test_utils.random_sequence(
-          batch_size,
-          time,
-          *channel_shape,
-          random_mask=only_advance_position_for_valid_timesteps,
-      )
-      self.verify_contract(l, x, training=False)
+class ApplyRotaryPositionalEncodingTest(
+    position_behaviors.ApplyRotaryPositionalEncodingTest,
+    test_utils.SequenceLayerTest,
+    parameterized.TestCase,
+):
 
   @parameterized.product(
       test_utils.standard_dtype_configs(input=True),
@@ -275,117 +100,25 @@ def test_dtypes(
   ):
     max_wavelength = 1.0e4
     channel_shape = (2,)
-    key = jax.random.PRNGKey(1234)
-    l = position_lib.ApplyRotaryPositionalEncoding.Config(
+    config = self.sl.ApplyRotaryPositionalEncoding.Config(
         max_wavelength=max_wavelength,
         only_advance_position_for_valid_timesteps=only_advance_position_for_valid_timesteps,
         positions_in_at_least_fp32=positions_in_at_least_fp32,
         name='rope',
-    ).make()
-
-    batch_size = 2
-    x = test_utils.random_sequence(
-        batch_size, 1, *channel_shape, dtype=input_dtype
     )
-    l = self.init_and_bind_layer(key, l, x)
-    for time in range(13 * l.block_size, 15 * l.block_size):
-      x = test_utils.random_sequence(
+    layer = self.make_layer(config)
+    batch_size = 2
+    x = self.random_sequence(batch_size, 1, *channel_shape, dtype=input_dtype)
+    layer = self.init_layer(layer, x)
+    for time in range(13 * layer.block_size, 15 * layer.block_size):
+      x = self.random_sequence(
           batch_size,
           time,
           *channel_shape,
           random_mask=only_advance_position_for_valid_timesteps,
           dtype=input_dtype,
       )
-      self.verify_contract(l, x, training=False)
-
-  def test_only_advance_position_for_valid_timesteps(self):
-    l = (
-        position_lib.ApplyRotaryPositionalEncoding.Config(
-            max_wavelength=1.0e5,
-            only_advance_position_for_valid_timesteps=True,
-            name='rope',
-        )
-        .make()
-        .bind({})
-    )
-
-    x = types.Sequence(
-        jax.random.normal(jax.random.PRNGKey(1234), (3, 3, 6)),
-        jnp.asarray(
-            [[False, True, True], [True, False, True], [True, True, False]]
-        ),
-    ).mask_invalid()
-
-    y = l.layer(x, training=False)
-
-    # Verify the layer ignores invalid timesteps by showing the output is equal
-    # to processing a sequence without the invalid timesteps.
-    self.assertSequencesEqual(
-        y[0:1, 1:],
-        l.layer(x[0:1, 1:], training=False),
-    )
-    self.assertSequencesEqual(
-        types.Sequence.concatenate_sequences([y[1:2, :1], y[1:2, 2:]]),
-        l.layer(
-            types.Sequence.concatenate_sequences([x[1:2, :1], x[1:2, 2:]]),
-            training=False,
-        ),
-    )
-    self.assertSequencesEqual(
-        y[2:3, :-1],
-        l.layer(x[2:3, :-1], training=False),
-    )
-
-  def test_external_positions(self):
-    key = jax.random.PRNGKey(1234)
-    l = position_lib.ApplyRotaryPositionalEncoding.Config(
-        max_wavelength=1.0e4,
-        only_advance_position_for_valid_timesteps=False,
-        positions_name='positions',
-        name='rope',
-    ).make()
-
-    x = test_utils.random_sequence(1, 5, 8, random_lengths=False)
-    x = types.Sequence.concatenate_sequences([x, x])
-    constants = {
-        'positions': types.Sequence.from_values(jnp.arange(10)[jnp.newaxis] % 5)
-    }
-    l = self.init_and_bind_layer(key, l, x, constants=constants)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.name, 'rope')
-    self.assertEqual(l.get_output_shape_for_sequence(x), x.shape[2:])
-    self.assertEmpty(jax.tree_util.tree_leaves(l.variables))
-    y = self.verify_contract(
-        l,
-        x,
-        constants=constants,
-        training=False,
-        stream_constants=True,
-        pad_constants=True,
-    )
-    # Since the positions repeat, the first half should equal the second half.
-    self.assertSequencesEqual(y[:, :5], y[:, 5:])
-
-  def test_error_only_advance_position_for_valid_timesteps_and_external_positions(
-      self,
-  ):
-    with self.assertRaises(ValueError):
-      key = jax.random.PRNGKey(1234)
-      l = position_lib.ApplyRotaryPositionalEncoding.Config(
-          max_wavelength=1.0e4,
-          positions_name='positions',
-          only_advance_position_for_valid_timesteps=True,
-          name='rope',
-      ).make()
-      x = test_utils.random_sequence(1, 5, 8, random_lengths=False)
-      x = types.Sequence.concatenate_sequences([x, x])
-      constants = {
-          'positions': types.Sequence.from_values(
-              jnp.arange(10)[jnp.newaxis] % 5
-          )
-      }
-      self.init_and_bind_layer(key, l, x, constants=constants)
+      self.verify_contract(layer, x, training=False)
 
 
 if __name__ == '__main__':
diff --git a/sequence_layers/jax/simple.py b/sequence_layers/jax/simple.py
index 716e0f3..d0b5de8 100644
--- a/sequence_layers/jax/simple.py
+++ b/sequence_layers/jax/simple.py
@@ -20,7 +20,8 @@
 import functools
 import math
 import typing
-from typing import Any, Callable, Sequence as TypingSequence
+from typing import Any, Callable, override
+from typing import Sequence as TypingSequence
 
 from absl import logging
 import einops
@@ -30,10 +31,14 @@
 import jax.numpy as jnp
 import numpy as np
 from sequence_layers.jax import meta
+
 from sequence_layers.jax import sharding as sharding_lib
 from sequence_layers.jax import types
+from sequence_layers.jax import typing as jt
 from sequence_layers.jax import utils
-from typing_extensions import override
+from sequence_layers.jax.types import MaskT
+from sequence_layers.jax.types import ValuesT
+from sequence_layers.specs import simple as spec
 
 try:
   # JAX v0.10.0 or newer
@@ -111,15 +116,13 @@
 
 
 def _to_tuple(x: complex | list[Any]) -> complex | tuple[Any, ...]:
-  """Replaces lists in a pytree of complex with tuples."""
   if isinstance(x, list):
-    return tuple(_to_tuple(i) for i in x)
-  else:
-    return x
+    return tuple(_to_tuple(item) for item in x)
+  return x
 
 
 @dataclasses.dataclass(frozen=True)
-class HashableArray:
+class HashableArray(spec.HashableArray):
   """Hashable multidimensional array of tuples."""
 
   data: complex | tuple[Any, ...]
@@ -130,6 +133,7 @@ def from_array(cls, x: np.ndarray) -> 'HashableArray':
     x = np.asarray(x)
     return HashableArray(_to_tuple(x.tolist()), x.dtype)
 
+  @override
   def to_array(self) -> np.ndarray:
     return np.asarray(self.data, dtype=self.dtype)
 
@@ -160,6 +164,7 @@ def _validate(
           f' with the input  channel shape ({input_shape=}).'
       )
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -173,11 +178,13 @@ def get_output_shape(
     return jnp.broadcast_shapes(input_shape, parameter.shape)
 
 
-class Scale(StatelessPointwiseBroadcasting):
+class Scale(
+    StatelessPointwiseBroadcasting, spec.Scale[types.Sequence, types.ShapeDType]
+):
   """Scales the input by a provided constant or array."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Scale.Config):
     """Config for Scale."""
 
     # The value to scale the input by. May be a numpy array, but must be
@@ -188,18 +195,26 @@ class Config(types.SequenceLayerConfig):
     name: str | None = None
 
     def __post_init__(self):
-      object.__setattr__(self, 'scale', HashableArray.from_array(self.scale))
+      object.__setattr__(
+          self,
+          'scale',
+          HashableArray.from_array(typing.cast(typing.Any, self.scale)),
+      )
 
+    @override
     def make(self) -> 'Scale':
-      return Scale(self, name=self.name)
+      return Scale(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def _broadcast_parameter(self) -> np.ndarray:
     return _to_array(self.config.scale)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -237,9 +252,11 @@ def __post_init__(self):
           self, 'shape', [] if self.shape is None else self.shape
       )
 
+    @override
     def make(self) -> 'Affine':
-      return Affine(self, name=self.name)
+      return Affine(config=self, name=self.name)
 
+  @override
   def setup(self):
     cfg = self.config
     if cfg.use_scale:
@@ -259,6 +276,7 @@ def setup(self):
 
   config: Config
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -267,6 +285,7 @@ def get_output_shape(
       constants: types.Constants | None = None,
   ) -> types.Shape:
     del constants
+    assert self.config.shape is not None
 
     # Check that the parameters do not have batch or time dimension.
     if len(input_shape) < len(self.config.shape):
@@ -278,7 +297,9 @@ def get_output_shape(
     # This function throws a value error if the shapes are not broadcastable.
     return jnp.broadcast_shapes(input_shape, self.config.shape)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -298,11 +319,13 @@ def layer(
     return x
 
 
-class Add(StatelessPointwiseBroadcasting):
+class Add(
+    StatelessPointwiseBroadcasting, spec.Add[types.Sequence, types.ShapeDType]
+):
   """Adds the provided constant or array to the input."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Add.Config):
     """Config for Add."""
 
     # The value to add to the input. May be a numpy array, but must be
@@ -313,18 +336,26 @@ class Config(types.SequenceLayerConfig):
     name: str | None = None
 
     def __post_init__(self):
-      object.__setattr__(self, 'shift', HashableArray.from_array(self.shift))
+      object.__setattr__(
+          self,
+          'shift',
+          HashableArray.from_array(typing.cast(typing.Any, self.shift)),
+      )
 
+    @override
     def make(self) -> 'Add':
-      return Add(self, name=self.name)
+      return Add(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def _broadcast_parameter(self) -> np.ndarray:
     return _to_array(self.config.shift)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -355,19 +386,25 @@ class Config(types.SequenceLayerConfig):
 
     def __post_init__(self):
       object.__setattr__(
-          self, 'maximum', HashableArray.from_array(self.maximum)
+          self,
+          'maximum',
+          HashableArray.from_array(typing.cast(typing.Any, self.maximum)),
       )
 
+    @override
     def make(self) -> 'Maximum':
-      return Maximum(self, name=self.name)
+      return Maximum(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def _broadcast_parameter(self) -> np.ndarray:
     return _to_array(self.config.maximum)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -398,19 +435,25 @@ class Config(types.SequenceLayerConfig):
 
     def __post_init__(self):
       object.__setattr__(
-          self, 'divisor', HashableArray.from_array(self.divisor)
+          self,
+          'divisor',
+          HashableArray.from_array(typing.cast(typing.Any, self.divisor)),
       )
 
+    @override
     def make(self) -> 'Mod':
-      return Mod(self, name=self.name)
+      return Mod(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def _broadcast_parameter(self) -> np.ndarray:
     return _to_array(self.config.divisor)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -443,19 +486,25 @@ class Config(types.SequenceLayerConfig):
 
     def __post_init__(self):
       object.__setattr__(
-          self, 'minimum', HashableArray.from_array(self.minimum)
+          self,
+          'minimum',
+          HashableArray.from_array(typing.cast(typing.Any, self.minimum)),
       )
 
+    @override
     def make(self) -> 'Minimum':
-      return Minimum(self, name=self.name)
+      return Minimum(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def _broadcast_parameter(self) -> np.ndarray:
     return _to_array(self.config.minimum)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -493,6 +542,7 @@ def __post_init__(self):
       else:
         object.__setattr__(self, 'axis', tuple(self.axis))
 
+    @override
     def make(self) -> '_ReduceChannels':
       raise NotImplementedError()
 
@@ -504,6 +554,7 @@ def _reduce_fn(self) -> Callable[..., jax.Array]:
     ...
 
   @property
+  @override
   def supports_step(self) -> bool:
     return True
 
@@ -513,6 +564,7 @@ def _validate_axis(self, input_shape: types.ShapeLike) -> tuple[int, ...]:
     rank = len(input_shape) + 2
     axis = self.config.axis
     if axis is not None:
+      # pyrefly: ignore[not-iterable]
       axis = [a + rank if a < 0 else a for a in axis]
     else:
       axis = list(range(2, rank))
@@ -524,6 +576,7 @@ def _validate_axis(self, input_shape: types.ShapeLike) -> tuple[int, ...]:
         )
     return tuple(axis)
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -538,7 +591,9 @@ def get_output_shape(
     else:
       return tuple(d for i, d in enumerate(input_shape) if i + 2 not in axis)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -560,10 +615,12 @@ class Mean(_ReduceChannels):
   class Config(_ReduceChannels.Config):
     """Config for Mean."""
 
+    @override
     def make(self) -> 'Mean':
-      return Mean(self, name=self.name)
+      return Mean(config=self, name=self.name)
 
   @property
+  @override
   def _reduce_fn(self) -> Callable[..., jax.Array]:
     return jnp.mean
 
@@ -575,10 +632,12 @@ class Min(_ReduceChannels):
   class Config(_ReduceChannels.Config):
     """Config for Min."""
 
+    @override
     def make(self) -> 'Min':
-      return Min(self, name=self.name)
+      return Min(config=self, name=self.name)
 
   @property
+  @override
   def _reduce_fn(self) -> Callable[..., jax.Array]:
     return jnp.min
 
@@ -590,10 +649,12 @@ class Max(_ReduceChannels):
   class Config(_ReduceChannels.Config):
     """Config for Max."""
 
+    @override
     def make(self) -> 'Max':
-      return Max(self, name=self.name)
+      return Max(config=self, name=self.name)
 
   @property
+  @override
   def _reduce_fn(self) -> Callable[..., jax.Array]:
     return jnp.max
 
@@ -605,38 +666,50 @@ class Sum(_ReduceChannels):
   class Config(_ReduceChannels.Config):
     """Config for Sum."""
 
+    @override
     def make(self) -> 'Sum':
-      return Sum(self, name=self.name)
+      return Sum(config=self, name=self.name)
 
   @property
+  @override
   def _reduce_fn(self) -> Callable[..., jax.Array]:
     return jnp.sum
 
 
-class Abs(types.StatelessPointwiseFunctor):
+class Abs(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Abs[types.Sequence, types.ShapeDType],
+):
   """Absolute value layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Abs.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'Abs':
-      return Abs(self, name=self.name)
+      return Abs(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.nowrap
-  def fn(
+  def fn[ValuesT: jt.ArrayT, MaskT: jt.ArrayT](
       self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+
+    # pyrefly: ignore[bad-argument-type]
     return jnp.abs(values), mask
 
+  @override
   @nn.nowrap
   def get_output_dtype(
       self,
@@ -654,31 +727,42 @@ def get_output_dtype(
         return input_dtype
 
 
-class Cast(types.StatelessPointwiseFunctor):
+class Cast(
+    types.StatelessPointwiseFunctor,
+    spec.Cast[types.Sequence, types.ShapeDType],
+):
   """Cast input values to the specified type."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Cast.Config):
     dtype: types.DType
     name: str | None = None
 
+    @override
     def make(self) -> 'Cast':
-      return Cast(self, name=self.name)
+      return Cast(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[missing-attribute]
     return values.astype(self.config.dtype), mask
 
+  @override
   @nn.nowrap
   def get_output_dtype(
       self,
@@ -689,21 +773,30 @@ def get_output_dtype(
     return self.config.dtype
 
 
-class GatedUnit(types.PreservesType, types.Stateless):
+class GatedUnit(
+    types.PreservesType,
+    types.Stateless,
+    spec.GatedUnit[types.Sequence, types.ShapeDType],
+):
   """Computes a generalized Gated Unit, reducing the input channels by 2x."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
-    feature_activation: Callable[[types.ValuesT], types.ValuesT] | None
-    gate_activation: Callable[[types.ValuesT], types.ValuesT] | None
+  class Config(spec.GatedUnit.Config):
+    feature_activation: Callable[[types.ArrayLike], types.ArrayLike] | None = (
+        None
+    )
+    gate_activation: Callable[[types.ArrayLike], types.ArrayLike] | None = None
     name: str | None = None
 
+    @override
     def make(self) -> 'GatedUnit':
-      return GatedUnit(self, name=self.name)
+      return GatedUnit(config=self, name=self.name)
 
   config: Config
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -716,9 +809,11 @@ def layer(
       feature = self.config.feature_activation(feature)
     if self.config.gate_activation:
       gate = self.config.gate_activation(gate)
+    # pyrefly: ignore[unsupported-operation]
     values = feature * gate
     return types.Sequence(values, x.mask)
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -735,29 +830,53 @@ def get_output_shape(
     return tuple(input_shape[:-1]) + (channels // 2,)
 
 
-class GatedLinearUnit(GatedUnit):
+class GatedLinearUnit(
+    GatedUnit, spec.GatedLinearUnit[types.Sequence, types.ShapeDType]
+):
   """Computes a Gated Linear Unit, reducing the input channels by 2x."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(GatedUnit.Config, spec.GatedLinearUnit.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'GatedLinearUnit':
       return GatedLinearUnit(
-          GatedUnit.Config(None, jax.nn.sigmoid, name=self.name), name=self.name
+          config=GatedUnit.Config(
+              None,
+              typing.cast(
+                  typing.Callable[[types.ArrayLike], types.ArrayLike],
+                  jax.nn.sigmoid,
+              ),
+              name=self.name,
+          ),
+          name=self.name,
       )
 
 
-class GatedTanhUnit(GatedUnit):
+class GatedTanhUnit(
+    GatedUnit, spec.GatedTanhUnit[types.Sequence, types.ShapeDType]
+):
   """Computes a Gated Tanh Unit, reducing the input channels by 2x."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(GatedUnit.Config, spec.GatedTanhUnit.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'GatedTanhUnit':
       return GatedTanhUnit(
-          GatedUnit.Config(jax.nn.tanh, jax.nn.sigmoid, name=self.name),
+          config=GatedUnit.Config(
+              typing.cast(
+                  typing.Callable[[types.ArrayLike], types.ArrayLike],
+                  jax.nn.tanh,
+              ),
+              typing.cast(
+                  typing.Callable[[types.ArrayLike], types.ArrayLike],
+                  jax.nn.sigmoid,
+              ),
+              name=self.name,
+          ),
           name=self.name,
       )
 
@@ -770,13 +889,16 @@ class Config(types.SequenceLayerConfig):
     clip_value: float
     name: str | None = None
 
+    @override
     def make(self) -> 'GradientClipping':
       assert self.clip_value > 0
-      return GradientClipping(self, name=self.name)
+      return GradientClipping(config=self, name=self.name)
 
   config: Config
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -839,17 +961,24 @@ def fn(
     return jax.lax.stop_gradient(values), mask
 
 
-class Identity(types.PreservesType, types.StatelessPointwise):
+class Identity(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.Identity[types.Sequence, types.ShapeDType],
+):
   """Identity pass-through of the input."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Identity.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'Identity':
       return Identity(name=self.name)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -870,12 +999,15 @@ class Config(types.SequenceLayerConfig):
     mask_sharding: types.Sharding | None = None
     name: str | None = None
 
+    @override
     def make(self) -> 'ApplySharding':
-      return ApplySharding(self, name=self.name)
+      return ApplySharding(config=self, name=self.name)
 
   config: Config
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -908,12 +1040,15 @@ class Config(types.SequenceLayerConfig):
     apply_to_mask: bool = False
     name: str | None = None
 
+    @override
     def make(self) -> 'OptimizationBarrier':
-      return OptimizationBarrier(self, name=self.name)
+      return OptimizationBarrier(config=self, name=self.name)
 
   config: Config
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -932,7 +1067,10 @@ def shard_values_mask(values, mask):
     return x.apply_masked(shard_values_mask)
 
 
-class Lambda(types.Stateless):
+class Lambda(
+    types.Stateless,
+    spec.Lambda[types.Sequence, types.ShapeDType],
+):
   """A SequenceLayer that wraps a Python lambda function.
 
   The wrapped lambda is assumed to be stateless. The receptive field of the
@@ -941,7 +1079,7 @@ class Lambda(types.Stateless):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Lambda.Config):
     """Configuration for a Lambda layer."""
 
     # If sequence_input is True, a callable that takes an sl.Sequence and
@@ -964,12 +1102,14 @@ class Config(types.SequenceLayerConfig):
     # An optional name for the layer.
     name: str | None = None
 
+    @override
     def make(self) -> 'Lambda':
-      return Lambda(self, name=self.name)
+      return Lambda(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def supports_step(self) -> bool:
     return True
 
@@ -987,14 +1127,16 @@ def _validate_input_spec(self, input_spec: types.ShapeDType) -> None:
     #         f' input spec {expected_input_spec=}'
     #     )
 
+  @override
   def get_output_spec(
       self,
-      input_spec: types.ChannelSpec,
+      input_spec: types.ShapeDType,
       *,
       constants: types.Constants | None = None,
-  ) -> types.ChannelSpec:
+  ) -> types.ShapeDType:
     self._validate_input_spec(input_spec)
     if self.config.sequence_input:
+      # pyrefly: ignore[bad-assignment]
       input_spec = types.Sequence(
           types.ShapeDType(
               (1, 1) + tuple(input_spec.shape),
@@ -1010,6 +1152,7 @@ def get_output_spec(
     output_spec = jax.eval_shape(self.config.fn, input_spec)
     return jax.ShapeDtypeStruct(output_spec.shape[2:], output_spec.dtype)
 
+  @override
   @nn.nowrap
   def get_output_dtype(
       self,
@@ -1028,6 +1171,7 @@ def get_output_dtype(
         )
     ).dtype
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -1046,7 +1190,9 @@ def get_output_shape(
         )
     ).shape
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -1076,36 +1222,48 @@ def layer(
             f' {values.shape=}'
         )
       if self.config.mask_required:
+        # pyrefly: ignore[bad-specialization]
         y = types.Sequence(values, x.mask)
       else:
+        # pyrefly: ignore[bad-specialization]
         y = type(x)(values, x.mask)
 
     return y
 
 
-class CheckpointName(types.PreservesType, types.StatelessPointwiseFunctor):
+class CheckpointName(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.CheckpointName[types.Sequence, types.ShapeDType],
+):
   """Applies a checkpoint name to the sequence values."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.CheckpointName.Config):
     checkpoint_name: str
     name: str | None = None
 
+    @override
     def make(self) -> 'CheckpointName':
-      return CheckpointName(self, name=self.name)
+      return CheckpointName(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
     values = jax.ad_checkpoint.checkpoint_name(
         values, self.config.checkpoint_name
     )
@@ -1130,21 +1288,27 @@ class Config(types.SequenceLayerConfig):
     param_dtype: types.DType = jnp.float32
     name: str | None = None
 
+    @override
     def make(self) -> 'Snake':
-      return Snake(self, name=self.name)
+      return Snake(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.compact
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
     channel_shape = values.shape[2:]
     alpha_log = self.param(
         'alpha_log',
@@ -1164,81 +1328,117 @@ def fn(
     else:
       beta = alpha
 
+    # pyrefly: ignore[unsupported-operation]
     values += jnp.square(jnp.sin(values * alpha)) / (beta + 1e-12)
     return values, mask
 
 
-class Tanh(types.PreservesType, types.StatelessPointwiseFunctor):
+class Tanh(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Tanh[types.Sequence, types.ShapeDType],
+):
   """A tanh layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Tanh.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'Tanh':
-      return Tanh(self, name=self.name)
+      return Tanh(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jax.nn.tanh(values), mask
 
 
-class Relu(types.PreservesType, types.StatelessPointwiseFunctor):
+class Relu(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Relu[types.Sequence, types.ShapeDType],
+):
   """A Relu layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Relu.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'Relu':
-      return Relu(name=self.name)
+      return Relu(config=self, name=self.name)
+
+  config: Config
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jax.nn.relu(values), mask
 
 
-class LeakyRelu(types.PreservesType, types.StatelessPointwiseFunctor):
+class LeakyRelu(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.LeakyRelu[types.Sequence, types.ShapeDType],
+):
   """A Leaky Relu layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.LeakyRelu.Config):
     negative_slope: complex = 0.01
     name: str | None = None
 
+    @override
     def make(self) -> 'LeakyRelu':
-      return LeakyRelu(self, name=self.name)
+      return LeakyRelu(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jax.nn.leaky_relu(values, self.config.negative_slope), mask
 
 
@@ -1251,11 +1451,13 @@ class Config(types.SequenceLayerConfig):
     param_dtype: types.DType = jnp.float32
     name: str | None = None
 
+    @override
     def make(self) -> 'PRelu':
-      return PRelu(self, name=self.name)
+      return PRelu(config=self, name=self.name)
 
   config: Config
 
+  @override
   def setup(self):
     self.negative_slope = self.param(
         'negative_slope',
@@ -1265,87 +1467,129 @@ def setup(self):
     )
 
   @property
+  @override
   def mask_required(self) -> bool:
     return False
 
+  @override
   @nn.nowrap
-  def fn(
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
       self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
 
     return (
         jnp.where(
+            # pyrefly: ignore[unsupported-operation]
             values >= 0,
             values,
+            # pyrefly: ignore[unsupported-operation]
             self.negative_slope.astype(values.dtype) * values,
         ),
         mask,
     )
 
 
-class Elu(types.PreservesType, types.StatelessPointwiseFunctor):
+class Elu(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Elu[types.Sequence, types.ShapeDType],
+):
   """An elu activation layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Elu.Config):
     alpha: complex = 1.0
     name: str | None = None
 
+    @override
     def make(self) -> 'Elu':
-      return Elu(self, name=self.name)
+      return Elu(config=self, name=self.name)
 
   config: Config
 
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jax.nn.elu(values, self.config.alpha), mask
 
 
-class Exp(types.PreservesType, types.StatelessPointwiseFunctor):
+class Exp(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Exp[types.Sequence, types.ShapeDType],
+):
   """An exp layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Exp.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'Exp':
-      return Exp(self, name=self.name)
+      return Exp(config=self, name=self.name)
 
   config: Config
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jnp.exp(values), mask
 
 
-class Log(types.PreservesType, types.StatelessPointwiseFunctor):
+class Log(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Log[types.Sequence, types.ShapeDType],
+):
   """A log layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Log.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'Log':
-      return Log(self, name=self.name)
+      return Log(config=self, name=self.name)
 
   config: Config
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jnp.log(values), mask
 
 
@@ -1357,140 +1601,199 @@ class Config(types.SequenceLayerConfig):
     power: float = 1.0
     name: str | None = None
 
+    @override
     def make(self) -> 'Power':
-      return Power(self, name=self.name)
+      return Power(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jnp.power(values, self.config.power), mask
 
 
-class Sigmoid(types.PreservesType, types.StatelessPointwiseFunctor):
+class Sigmoid(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Sigmoid[types.Sequence, types.ShapeDType],
+):
   """A sigmoid layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Sigmoid.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'Sigmoid':
-      return Sigmoid(self, name=self.name)
+      return Sigmoid(config=self, name=self.name)
 
   config: Config
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jax.nn.sigmoid(values), mask
 
 
-class Softplus(types.PreservesType, types.StatelessPointwiseFunctor):
+class Softplus(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Softplus[types.Sequence, types.ShapeDType],
+):
   """A softplus layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Softplus.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'Softplus':
-      return Softplus(self, name=self.name)
+      return Softplus(config=self, name=self.name)
 
   config: Config
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jax.nn.softplus(values), mask
 
 
-class Softmax(types.PreservesType, types.StatelessPointwiseFunctor):
+class Softmax(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Softmax[types.Sequence, types.ShapeDType],
+):
   """A softmax layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Softmax.Config):
     axis: int = -1
     name: str | None = None
 
+    @override
     def make(self) -> 'Softmax':
-      return Softmax(self, name=self.name)
+      return Softmax(config=self, name=self.name)
 
   config: Config
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
     axis = self.config.axis
     if (axis if axis >= 0 else values.ndim + axis) < 2:
       raise ValueError(
           'The softmax cannot be applied on the batch or time dimension (got'
           f' {axis=} for shape={values.shape})'
       )
+    # pyrefly: ignore[bad-argument-type]
     return jax.nn.softmax(values, axis=axis), mask
 
 
-class Swish(types.PreservesType, types.StatelessPointwiseFunctor):
+class Swish(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Swish[types.Sequence, types.ShapeDType],
+):
   """A Swish layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Swish.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'Swish':
       return Swish(name=self.name)
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jax.nn.swish(values), mask
 
 
-class Gelu(types.PreservesType, types.StatelessPointwiseFunctor):
+class Gelu(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Gelu[types.Sequence, types.ShapeDType],
+):
   """A Gaussian Error Linear Unit (GELU) layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Gelu.Config):
     approximate: bool = True
     name: str | None = None
 
+    @override
     def make(self) -> 'Gelu':
-      return Gelu(self, name=self.name)
+      return Gelu(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def mask_required(self):
     return False
 
+  @override
   @nn.nowrap
-  def fn(
-      self,
-      values: types.ValuesT,
-      mask: types.MaskT,
-  ) -> tuple[types.ValuesT, types.MaskT]:
+  def fn[
+      ValuesT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+      MaskT: (jax.Array, np.ndarray, jax.ShapeDtypeStruct),
+  ](
+      self,
+      values: ValuesT,
+      mask: MaskT,
+  ) -> tuple[ValuesT, MaskT]:
+    # pyrefly: ignore[bad-argument-type]
     return jax.nn.gelu(values, approximate=self.config.approximate), mask
 
 
@@ -1518,13 +1821,14 @@ def __post_init__(self):
       # Use hashable types for sequences.
       object.__setattr__(self, 'slices', tuple(self.slices))
 
-    def as_slices(self) -> tuple[slice | int | None]:
+    def as_slices(self) -> tuple[slice | int | None, ...]:
       return tuple(
           slice(*s) if isinstance(s, tuple) else s for s in self.slices
       )
 
+    @override
     def make(self) -> 'Slice':
-      return Slice(self, name=self.name)
+      return Slice(config=self, name=self.name)
 
   config: Config
 
@@ -1537,6 +1841,7 @@ def _validate_slice_for_input_shape(self, input_shape: types.ShapeLike):
           % (input_shape, self.config.slices)
       )
 
+  @override
   def get_output_shape(
       self,
       input_shape: types.ShapeLike,
@@ -1547,10 +1852,6 @@ def get_output_shape(
     output_dims = []
     input_index = 0
 
-    # Compute the output shape:
-    # - int: Remove the current input dimension.
-    # - slice: Compute the output dimension size using slice.indices.
-    # - None (tf.newaxis): Add a dimension.
     for slice_i in self.config.slices:
       if isinstance(slice_i, tuple):
         slice_i = slice(*slice_i)
@@ -1572,7 +1873,9 @@ def get_output_shape(
         )
     return tuple(output_dims)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -1580,7 +1883,6 @@ def layer(
       training: bool,
       constants: types.Constants | None = None,
   ) -> types.Sequence:
-    # Slice the batch and time dimensions with [:, :].
     full_slice = (
         slice(None, None, None),
         slice(None, None, None),
@@ -1590,7 +1892,11 @@ def layer(
     return x.apply_values_masked(lambda v: v.__getitem__(full_slice))
 
 
-class Flatten(types.PreservesType, types.Stateless):
+class Flatten(
+    types.PreservesType,
+    types.Stateless,
+    spec.Flatten[types.Sequence, types.ShapeDType],
+):
   """Flattens the channel dimensions of the input sequence.
 
   An input sequence with shape [batch_size, time, ...] is reshaped to
@@ -1601,9 +1907,11 @@ class Flatten(types.PreservesType, types.Stateless):
   class Config(types.SequenceLayerConfig):
     name: str | None = None
 
+    @override
     def make(self) -> 'Flatten':
       return Flatten(name=self.name)
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -1612,9 +1920,11 @@ def get_output_shape(
       constants: types.Constants | None = None,
   ) -> types.Shape:
     del constants
-    return (np.prod(input_shape),)
+    return (int(np.prod(input_shape)),)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -1628,17 +1938,18 @@ def layer(
     return x.apply_values_masked(jnp.reshape, [batch_size, time, num_elements])
 
 
-class OneHot(types.Stateless):
+class OneHot(types.Stateless, spec.OneHot[types.Sequence, types.ShapeDType]):
   """Computes one-hot vector of the input."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.OneHot.Config):
     depth: int
     compute_dtype: types.DType = jnp.float32
     name: str | None = None
 
+    @override
     def make(self) -> 'OneHot':
-      return OneHot(self, name=self.name)
+      return OneHot(config=self, name=self.name)
 
   config: Config
 
@@ -1650,6 +1961,7 @@ def _validate(self, dtype: types.DType):
           f' {dtype}'
       )
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -1659,6 +1971,7 @@ def get_output_shape(
   ) -> types.Shape:
     return tuple(input_shape) + (self.config.depth,)
 
+  @override
   @nn.nowrap
   def get_output_dtype(
       self,
@@ -1669,7 +1982,9 @@ def get_output_dtype(
     self._validate(input_dtype)
     return self.config.compute_dtype
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -1688,11 +2003,13 @@ def layer(
     )
 
 
-class Embedding(types.Stateless):
+class Embedding(
+    types.Stateless, spec.Embedding[types.Sequence, types.ShapeDType]
+):
   """Computes embeddings of integer input codes."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Embedding.Config):
     """Config for Embedding."""
 
     # Dimensionality of the embedded values.
@@ -1712,11 +2029,13 @@ class Config(types.SequenceLayerConfig):
     name: str | None = None
     embedding_param_name: str = 'embedding'
 
+    @override
     def make(self) -> 'Embedding':
-      return Embedding(self, name=self.name)
+      return Embedding(config=self, name=self.name)
 
   config: Config
 
+  @override
   def setup(self):
     self.embedding = self.param(
         self.config.embedding_param_name,
@@ -1737,6 +2056,7 @@ def _validate(self, dtype: types.DType):
           f' {dtype}'
       )
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -1744,8 +2064,10 @@ def get_output_shape(
       *,
       constants: types.Constants | None = None,
   ) -> types.Shape:
+    del constants
     return tuple(input_shape) + (self.config.dimension,)
 
+  @override
   @nn.nowrap
   def get_output_dtype(
       self,
@@ -1758,7 +2080,9 @@ def get_output_dtype(
       return self.config.param_dtype
     return self.config.compute_dtype
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -1840,7 +2164,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'EmbeddingTranspose':
-      return EmbeddingTranspose(self, name=self.name)
+      return EmbeddingTranspose(config=self, name=self.name)
 
   config: Config
 
@@ -1858,6 +2182,7 @@ def get_output_dtype(
       *,
       constants: types.Constants | None = None,
   ) -> types.DType:
+    assert self.embedding.config is not None
     return utils.get_promoted_dtype(
         input_dtype,
         self.config.param_dtype or self.embedding.config.param_dtype,
@@ -1872,6 +2197,8 @@ def get_output_shape(
       *,
       constants: types.Constants | None = None,
   ) -> types.Shape:
+    del constants
+    assert self.config.embedding.config is not None
     if (
         not input_shape
         or input_shape[-1] != self.config.embedding.config.dimension
@@ -1885,14 +2212,16 @@ def get_output_shape(
   @override
   @types.check_layer
   @nn.compact
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
+      *,
       training: bool,
       constants: types.Constants | None = None,
   ) -> types.Sequence:
     del training, constants
-
+    assert self.embedding.config is not None
     if self.config.use_bias:
       bias_init = utils.shard_initializer(
           self.config.bias_init, self.config.bias_sharding
@@ -1917,11 +2246,15 @@ def layer(
     return ret
 
 
-class ExpandDims(types.PreservesType, types.Stateless):
+class ExpandDims(
+    types.PreservesType,
+    types.Stateless,
+    spec.ExpandDims[types.Sequence, types.ShapeDType],
+):
   """Applies jnp.expand_dims to the channels dimension of the input."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.ExpandDims.Config):
     """Configuration for ExpandDims."""
 
     # The axis or axes in the channel shape to expand dims on.
@@ -1935,8 +2268,9 @@ def __post_init__(self):
       if not isinstance(self.axis, int):
         object.__setattr__(self, 'axis', tuple(self.axis))
 
+    @override
     def make(self) -> 'ExpandDims':
-      return ExpandDims(self, name=self.name)
+      return ExpandDims(config=self, name=self.name)
 
   config: Config
 
@@ -1961,6 +2295,7 @@ def _normalize_and_validate_axes(
     return dims
 
   @nn.nowrap
+  @override
   def get_output_shape(
       self,
       input_shape: types.ShapeLike,
@@ -1975,6 +2310,8 @@ def get_output_shape(
     return tuple(output_shape)
 
   @types.check_layer
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -1987,11 +2324,15 @@ def layer(
     return x.apply_values_masked(jnp.expand_dims, dims)
 
 
-class Reshape(types.PreservesType, types.Stateless):
+class Reshape(
+    types.PreservesType,
+    types.Stateless,
+    spec.Reshape[types.Sequence, types.ShapeDType],
+):
   """Reshapes the channels dimension of the input."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Reshape.Config):
     """Configuration for Reshape."""
 
     # The new shape of the channels dimension. Can't contain -1, and must have
@@ -2004,8 +2345,9 @@ def __post_init__(self):
       # Use hashable types for sequences.
       object.__setattr__(self, 'output_shape', tuple(self.output_shape))
 
+    @override
     def make(self) -> 'Reshape':
-      return Reshape(self, name=self.name)
+      return Reshape(config=self, name=self.name)
 
   config: Config
 
@@ -2019,6 +2361,7 @@ def _validate_output_shape(self, input_shape: types.ShapeLike) -> None:
       )
 
   @nn.nowrap
+  @override
   def get_output_shape(
       self,
       input_shape: types.ShapeLike,
@@ -2030,6 +2373,8 @@ def get_output_shape(
     return tuple(self.config.output_shape)
 
   @types.check_layer
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2077,8 +2422,9 @@ def __post_init__(self):
       # Use hashable types for sequences.
       object.__setattr__(self, 'output_shape', tuple(self.output_shape))
 
+    @override
     def make(self) -> 'GlobalReshape':
-      return GlobalReshape(self, name=self.name)
+      return GlobalReshape(config=self, name=self.name)
 
   config: Config
 
@@ -2092,6 +2438,7 @@ def _validate_reshape(self, input_shape: types.ShapeLike) -> None:
       )
 
   @nn.nowrap
+  @override
   def get_output_shape(
       self,
       input_shape: types.ShapeLike,
@@ -2102,14 +2449,18 @@ def get_output_shape(
     return tuple(self.config.output_shape[1:])
 
   @property
+  @override
   def supports_step(self) -> bool:
     return False
 
   @property
+  @override
   def receptive_field_per_step(self) -> dict[int, types.ReceptiveField]:
     return {0: (-np.inf, np.inf)}
 
   @types.check_layer
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2137,11 +2488,15 @@ def layer(
     return types.Sequence(out, mask=mask)
 
 
-class Transpose(types.PreservesType, types.Stateless):
+class Transpose(
+    types.PreservesType,
+    types.Stateless,
+    spec.Transpose[types.Sequence, types.ShapeDType],
+):
   """Transposes (i.e., permutes) the channels dimension of the input."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Transpose.Config):
     """Configuration for Transpose.
 
     The usage is the same as that of jax.numpy.transpose.
@@ -2167,12 +2522,13 @@ def __post_init__(self):
       if self.axes is not None:
         object.__setattr__(self, 'axes', tuple(self.axes))
 
+    @override
     def make(self) -> 'Transpose':
 
       if self.axes is not None and (0 in self.axes or 1 in self.axes):
         raise ValueError("Can't transpose batch or time dimension.")
 
-      return Transpose(self, name=self.name)
+      return Transpose(config=self, name=self.name)
 
   config: Config
 
@@ -2192,6 +2548,7 @@ def _validate_axes(self, input_shape: types.ShapeLike) -> tuple[int, ...]:
 
     return tuple(axes)
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -2203,7 +2560,9 @@ def get_output_shape(
     axes = self._validate_axes(input_shape)
     return tuple(input_shape[a - 2] for a in axes)
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2219,23 +2578,28 @@ class SwapAxes(Transpose):
   """Swap two channel axes."""
 
   @dataclasses.dataclass(frozen=True)
+  # pyrefly: ignore[bad-override]
   class Config(types.SequenceLayerConfig):
     axis1: int
     axis2: int
     name: str | None = None
 
+    @override
     def make(self) -> 'SwapAxes':
 
       axes = [self.axis1, self.axis2]
       if 0 in axes or 1 in axes:
         raise ValueError("Can't swap batch or time dimension.")
 
+      # pyrefly: ignore[missing-argument]
       return SwapAxes(
-          Transpose.Config(axes=axes, name=self.name), name=self.name
+          typing.cast(typing.Any, Transpose.Config(axes=axes, name=self.name)),
+          name=self.name,
       )
 
   @override
   def _validate_axes(self, input_shape: types.ShapeLike) -> tuple[int, ...]:
+    assert self.config.axes is not None
     ndim = 2 + len(input_shape)  # ndim including batch and time.
     axes = [a if a >= 0 else ndim + a for a in self.config.axes]
     if 0 in axes or 1 in axes:
@@ -2254,7 +2618,7 @@ class MoveAxis(Transpose):
   """Moves one or several channel axes to new locations."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(types.SequenceLayerConfig):  # pyrefly: ignore[bad-override]
     """Config of MoveAxis layer."""
 
     source: int | TypingSequence[int]
@@ -2266,24 +2630,22 @@ def __post_init__(self):
       object.__setattr__(self, 'source', to_tuple(self.source))
       object.__setattr__(self, 'destination', to_tuple(self.destination))
 
+    @override
     def make(self) -> 'MoveAxis':
-
-      if (
-          0 in self.source
-          or 1 in self.source
-          or 0 in self.destination
-          or 1 in self.destination
-      ):
+      source = typing.cast(TypingSequence[int], self.source)
+      destination = typing.cast(TypingSequence[int], self.destination)
+      if 0 in source or 1 in source or 0 in destination or 1 in destination:
         raise ValueError("Can't move batch or time dimension.")
 
-      if len(self.source) != len(self.destination):
+      if len(source) != len(destination):
         raise ValueError(
-            f'Inconsistent number of elements: {len(self.source)} vs'
-            f' {len(self.destination)}'
+            f'Inconsistent number of elements: {len(source)} vs'
+            f' {len(destination)}'
         )
 
-      return MoveAxis(self, name=self.name)
+      return MoveAxis(config=self, name=self.name)
 
+  # pyrefly: ignore[bad-override]
   config: Config
 
   @override
@@ -2311,12 +2673,15 @@ class Emit(types.PreservesType, types.PreservesShape, types.StatelessEmitting):
   class Config(types.SequenceLayerConfig):
     name: str | None = None
 
+    @override
     def make(self) -> 'Emit':
-      return Emit(self, name=self.name)
+      return Emit(config=self, name=self.name)
 
   config: Config
 
   @types.check_layer_with_emits
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def layer_with_emits(
       self,
       x: types.Sequence,
@@ -2337,12 +2702,15 @@ class Config(types.SequenceLayerConfig):
     emit_name: str
     name: str | None = None
 
+    @override
     def make(self) -> 'NamedEmit':
-      return NamedEmit(self, name=self.name)
+      return NamedEmit(config=self, name=self.name)
 
   config: Config
 
   @types.check_layer_with_emits
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def layer_with_emits(
       self,
       x: types.Sequence,
@@ -2351,24 +2719,32 @@ def layer_with_emits(
       constants: types.Constants | None = None,
   ) -> tuple[types.Sequence, types.Emits]:
     return x, {self.config.emit_name: x}
+    return x, {self.config.emit_name: x}
 
 
-class Dropout(types.PreservesType, types.StatelessPointwise):
+class Dropout(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.Dropout[types.Sequence, types.ShapeDType],
+):
   """Computes dropout using Flax RNGs."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Dropout.Config):
     rate: float = 0.0
     broadcast_dims: TypingSequence[int] = ()
     rng_collection: str = 'dropout'
     name: str | None = None
 
+    @override
     def make(self) -> 'Dropout':
-      return Dropout(self, name=self.name)
+      return Dropout(config=self, name=self.name)
 
   config: Config
 
+  @override
   @types.check_step
+  # pyrefly: ignore[missing-override-decorator]
   def step(
       self,
       x: types.Sequence,
@@ -2434,7 +2810,9 @@ def apply_dropout(self, x: jax.Array, training: bool) -> jax.Array:
     return x
 
   @nn.compact
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2534,34 +2912,54 @@ def layer(
     return x.apply_values(lambda v: v + noise)
 
 
-class Downsample1D(types.PreservesType, types.PreservesShape, types.Stateless):
+class Downsample1D(
+    types.PreservesType,
+    types.Stateless,
+    spec.Downsample1D[types.Sequence, types.ShapeDType],
+):
   """A 1D downsampling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Downsample1D.Config):
     """Configuration for Downsample1D."""
 
     rate: int
     name: str | None = None
 
+    @override
     def make(self) -> 'Downsample1D':
-      return Downsample1D(self, name=self.name)
+      return Downsample1D(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def block_size(self) -> int:
     return self.config.rate
 
   @property
+  @override
   def output_ratio(self) -> fractions.Fraction:
     return fractions.Fraction(1, self.config.rate)
 
   @property
+  @override
   def input_latency(self) -> int:
     return self.config.rate - 1
 
+  @override
+  @nn.nowrap
+  def get_output_shape(
+      self,
+      input_shape: types.ShapeLike,
+      *,
+      constants: types.Constants | None = None,
+  ) -> types.Shape:
+    return tuple(input_shape)
+
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2576,30 +2974,49 @@ def layer(
     )
 
 
-class Upsample1D(types.PreservesType, types.PreservesShape, types.Stateless):
+class Upsample1D(
+    types.PreservesType,
+    types.Stateless,
+    spec.Upsample1D[types.Sequence, types.ShapeDType],
+):
   """A 1D upsampling layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Upsample1D.Config):
     """Configuration for Upsample1D."""
 
     rate: int
     name: str | None = None
 
+    @override
     def make(self) -> 'Upsample1D':
-      return Upsample1D(self, name=self.name)
+      return Upsample1D(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def output_ratio(self) -> fractions.Fraction:
     return fractions.Fraction(self.config.rate)
 
   @property
+  @override
   def receptive_field_per_step(self) -> dict[int, types.ReceptiveField]:
     return {s: (0, 0) for s in range(self.config.rate)}
 
+  @override
+  @nn.nowrap
+  def get_output_shape(
+      self,
+      input_shape: types.ShapeLike,
+      *,
+      constants: types.Constants | None = None,
+  ) -> types.Shape:
+    return tuple(input_shape)
+
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2628,26 +3045,33 @@ class Config(types.SequenceLayerConfig):
     def __post_init__(self):
       object.__setattr__(self, 'rate', utils.normalize_2tuple(self.rate))
 
+    @override
     def make(self) -> 'Upsample2D':
-      return Upsample2D(self, name=self.name)
+      return Upsample2D(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def output_ratio(self) -> fractions.Fraction:
-    return fractions.Fraction(self.config.rate[0])
+    rate = typing.cast(TypingSequence[int], self.config.rate)
+    return fractions.Fraction(rate[0])
 
   @property
+  @override
   def receptive_field_per_step(self) -> dict[int, types.ReceptiveField]:
-    return {s: (0, 0) for s in range(self.config.rate[0])}
+    rate = typing.cast(TypingSequence[int], self.config.rate)
+    return {s: (0, 0) for s in range(rate[0])}
 
   @nn.nowrap
+  @override
   def get_output_shape(
       self,
       input_shape: types.ShapeLike,
       *,
       constants: types.Constants | None = None,
   ) -> types.Shape:
+    rate = typing.cast(TypingSequence[int], self.config.rate)
     if len(input_shape) != 2:
       raise ValueError(
           'Upsample2D requires rank 4 input got:'
@@ -2655,11 +3079,13 @@ def get_output_shape(
       )
 
     return (
-        input_shape[0] * self.config.rate[1],
+        input_shape[0] * rate[1],
         input_shape[1],
     )
 
   @types.check_layer
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2667,28 +3093,36 @@ def layer(
       training: bool,
       constants: types.Constants | None = None,
   ) -> types.Sequence:
-    values = jnp.repeat(x.values, self.config.rate[0], axis=1)
-    values = jnp.repeat(values, self.config.rate[1], axis=2)
-    mask = jnp.repeat(x.mask, self.config.rate[0], axis=1)
+    rate = typing.cast(TypingSequence[int], self.config.rate)
+    values = jnp.repeat(x.values, rate[0], axis=1)
+    values = jnp.repeat(values, rate[1], axis=2)
+    mask = jnp.repeat(x.mask, rate[0], axis=1)
 
     # Upsampling does not change the masked state, so use the type of x to
     # repack the upsampled values and mask.
     return type(x)(values, mask)
 
 
-class MaskInvalid(types.PreservesType, types.StatelessPointwise):
+class MaskInvalid(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.MaskInvalid[types.Sequence, types.ShapeDType],
+):
   """Masks the input sequence."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.MaskInvalid.Config):
     name: str | None = None
 
+    @override
     def make(self) -> 'MaskInvalid':
-      return MaskInvalid(self, name=self.name)
+      return MaskInvalid(config=self, name=self.name)
 
   config: Config
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2700,11 +3134,15 @@ def layer(
     return x.mask_invalid()
 
 
-class Logging(types.PreservesType, types.StatelessPointwise):
+class Logging(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.Logging[types.Sequence, types.ShapeDType],
+):
   """Layer that logs input arguments to get_initial_state, step, and layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Logging.Config):
     """Configuration for the Logging layer."""
 
     prefix: str = ''
@@ -2725,8 +3163,9 @@ class Config(types.SequenceLayerConfig):
         '\ttraining={training}\n\tconstants={constants}'
     )
 
+    @override
     def make(self) -> 'Logging':
-      return Logging(self)
+      return Logging(config=self)
 
   config: Config
 
@@ -2758,7 +3197,9 @@ def arrays_to_specs(leaf: Any) -> types.ShapeDType | str:
       kwargs = jax.tree.map(arrays_to_specs, kwargs)
       logging.info(format_str.format(prefix=self.config.prefix, **kwargs))
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2774,10 +3215,11 @@ def layer(
     )
     return x
 
+  @override
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ChannelSpec,
+      input_spec: types.ShapeDType,
       *,
       training: bool,
       constants: types.Constants | None = None,
@@ -2794,6 +3236,8 @@ def get_initial_state(
     )
 
   @types.check_step
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def step(
       self,
       x: types.Sequence,
@@ -2819,12 +3263,15 @@ class Argmax(types.Stateless):
   class Config(types.SequenceLayerConfig):
     name: str | None = None
 
+    @override
     def make(self) -> 'Argmax':
-      return Argmax(self, name=self.name)
+      return Argmax(config=self, name=self.name)
 
   config: Config
 
   @types.check_layer
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2834,6 +3281,7 @@ def layer(
   ) -> types.Sequence:
     return x.apply_values(jnp.argmax, axis=-1)
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -2843,6 +3291,7 @@ def get_output_shape(
   ) -> types.Shape:
     return tuple(input_shape[:-1])
 
+  @override
   @nn.nowrap
   def get_output_dtype(
       self,
@@ -2880,12 +3329,14 @@ def __post_init__(self):
             f'`batch` and `time` are reserved axes labels (got {self.pattern}).'
         )
 
+    @override
     def make(self) -> 'EinopsRearrange':
-      return EinopsRearrange(self, name=self.name)
+      return EinopsRearrange(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def supports_step(self) -> bool:
     return True
 
@@ -2896,6 +3347,8 @@ def _get_rearrange_fn(self) -> Callable[[jax.Array], jax.Array]:
     return functools.partial(einops.rearrange, pattern=pattern, **axes_lengths)
 
   @types.check_layer
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -2908,6 +3361,7 @@ def layer(
     return x.apply_values(rearrange_fn)
 
   @nn.nowrap
+  @override
   def get_output_shape(
       self,
       input_shape: types.ShapeLike,
@@ -2916,7 +3370,9 @@ def get_output_shape(
   ) -> types.Shape:
     del constants
     rearrange_fn = self._get_rearrange_fn()
-    output = jax.eval_shape(rearrange_fn, jnp.zeros((1, 1) + input_shape))
+    output = jax.eval_shape(
+        rearrange_fn, jnp.zeros((1, 1) + tuple(input_shape))
+    )
     return tuple(output.shape[2:])
 
 
@@ -2951,18 +3407,21 @@ def __post_init__(self):
             f'`batch` is a reserved axes labels (got {self.pattern}).'
         )
 
+    @override
     def make(self) -> 'GlobalEinopsRearrange':
-      return GlobalEinopsRearrange(self, name=self.name)
+      return GlobalEinopsRearrange(config=self, name=self.name)
 
   config: Config
 
   @property
+  @override
   def supports_step(self) -> bool:
     return False
 
   @property
-  def receptive_field(self) -> tuple[int | None, int | None]:
-    return (-np.inf, np.inf)
+  @override
+  def receptive_field(self) -> types.ReceptiveField:
+    return typing.cast(types.ReceptiveField, (-np.inf, np.inf))
 
   def _get_rearrange_fn(self) -> Callable[[jax.Array], jax.Array]:
     before, after = self.config.pattern.split('->')
@@ -2971,6 +3430,8 @@ def _get_rearrange_fn(self) -> Callable[[jax.Array], jax.Array]:
     return functools.partial(einops.rearrange, pattern=pattern, **axes_lengths)
 
   @types.check_layer
+  @override
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
@@ -3000,6 +3461,7 @@ def layer(
     return types.Sequence(values, mask)
 
   @nn.nowrap
+  @override
   def get_output_shape(
       self,
       input_shape: types.ShapeLike,
@@ -3013,12 +3475,16 @@ def get_output_shape(
     else:
       time_dim = 1
     output = jax.eval_shape(
-        rearrange_fn, jnp.zeros((1, time_dim) + input_shape)
+        rearrange_fn, jnp.zeros((1, time_dim) + tuple(input_shape))
     )
     return tuple(output.shape[2:])
 
 
-class Squeeze(types.PreservesType, types.Stateless):
+class Squeeze(
+    types.PreservesType,
+    types.Stateless,
+    spec.Squeeze[types.Sequence, types.ShapeDType],
+):
   """This layer squeezes all the depth dimensions of the input.
 
   I.e. [batch_size, time, *depth_dims -> [batch_size, time] (where all the
@@ -3026,13 +3492,14 @@ class Squeeze(types.PreservesType, types.Stateless):
   """
 
   @dataclasses.dataclass(frozen=True)
-  class Config(types.SequenceLayerConfig):
+  class Config(spec.Squeeze.Config):
     """Config of Squeeze."""
 
     axis: int | TypingSequence[int] | None = None
 
     name: str | None = None
 
+    @override
     def make(self) -> 'Squeeze':
 
       axis = self.axis
@@ -3042,7 +3509,7 @@ def make(self) -> 'Squeeze':
       elif axis is not None and (0 in axis or 1 in axis):
         raise ValueError('Batch and time (axis=0 or 1) cannot be squeezed.')
 
-      return Squeeze(self, name=self.name)
+      return Squeeze(config=self, name=self.name)
 
   config: Config
 
@@ -3057,6 +3524,7 @@ def _validate_axis(self, input_shape: types.ShapeLike) -> tuple[int, ...]:
 
     return tuple(axis)
 
+  @override
   @nn.nowrap
   def get_output_shape(
       self,
@@ -3071,7 +3539,9 @@ def get_output_shape(
         types.ShapeDType((0, 1) + tuple(input_shape), jnp.float32),
     ).shape[2:]
 
+  @override
   @types.check_layer
+  # pyrefly: ignore[missing-override-decorator]
   def layer(
       self,
       x: types.Sequence,
diff --git a/sequence_layers/jax/simple_test.py b/sequence_layers/jax/simple_test.py
index 60af7a4..fa7eab7 100644
--- a/sequence_layers/jax/simple_test.py
+++ b/sequence_layers/jax/simple_test.py
@@ -30,132 +30,20 @@
 import jax.experimental.mesh_utils  # Required for OSS.
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import sharding as sharding_lib
 from sequence_layers.jax import simple
 from sequence_layers.jax import test_utils
 from sequence_layers.jax import types
+from sequence_layers.specs import simple_behaviors as spec
 
 
-class ScaleTest(test_utils.SequenceLayerTest):
-
-  @parameterized.parameters(((2, 13, 5),), ((2, 13, 5, 9),))
-  def test_basic(self, shape):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape)
-    l = simple.Scale.Config(scale=2.0, name='scale').make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-    self.assertEqual(l.name, 'scale')
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-    y_expected = x.apply_values(lambda v: v * 2.0)
-    self.assertSequencesEqual(y, y_expected)
-
-  @parameterized.parameters(((2, 13, 5),), ((2, 13, 9, 5),))
-  def test_ndarray(self, shape):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape)
-    l = simple.Scale.Config(
-        scale=np.arange(5, dtype=np.float32), name='scale'
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-    self.assertEqual(l.name, 'scale')
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-    y_expected = x.apply_values(lambda v: v * np.arange(5, dtype=np.float32))
-    self.assertSequencesEqual(y, y_expected)
-
-  def test_broadcast(self):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(2, 3, 5, 1)
-    l = simple.Scale.Config(scale=np.ones((5, 9))).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.get_output_shape_for_sequence(x), (5, 9))
-
-  def test_too_many_dims(self):
-    x = test_utils.random_sequence(2, 3, 5, 1)
-    l = simple.Scale.Config(scale=np.ones((5, 5, 5))).make().bind({})
-    with self.assertRaises(ValueError):
-      l.get_output_shape_for_sequence(x)
-
-    with self.assertRaises(ValueError):
-      l.layer(x, training=False)
-
-  def test_broadcast_failure(self):
-    x = test_utils.random_sequence(2, 3, 5, 9)
-    l = simple.Scale.Config(scale=np.ones((5,))).make().bind({})
-    with self.assertRaises(ValueError):
-      l.get_output_shape_for_sequence(x)
-
-    with self.assertRaises(ValueError):
-      l.layer(x, training=False)
-
-
-class AddTest(test_utils.SequenceLayerTest):
-
-  @parameterized.parameters((((2, 13, 5)),), (((2, 13, 5, 9)),))
-  def test_add(self, shape):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape)
-    l = simple.Add.Config(-2.0, name='add').make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-    self.assertEqual(l.name, 'add')
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-    y_expected = x.apply_values(lambda v: v - 2.0).mask_invalid()
-    self.assertSequencesEqual(y, y_expected)
-
-  @parameterized.parameters(((2, 13, 5),), ((2, 13, 9, 5),))
-  def test_ndarray(self, shape):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape)
-    l = simple.Add.Config(
-        shift=np.arange(5, dtype=np.float32), name='add'
-    ).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-    self.assertEqual(l.name, 'add')
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-    y_expected = x.apply_values(
-        lambda v: v + np.arange(5, dtype=np.float32)
-    ).mask_invalid()
-    self.assertSequencesEqual(y, y_expected)
-
-  def test_broadcast(self):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(2, 3, 5, 1)
-    l = simple.Add.Config(shift=np.ones((5, 9))).make()
-    l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.get_output_shape_for_sequence(x), (5, 9))
-
-  def test_too_many_dims(self):
-    x = test_utils.random_sequence(2, 3, 5, 1)
-    l = simple.Add.Config(shift=np.ones((5, 5, 5))).make().bind({})
-    with self.assertRaises(ValueError):
-      l.get_output_shape_for_sequence(x)
-
-    with self.assertRaises(ValueError):
-      l.layer(x, training=False)
+class ScaleTest(test_utils.SequenceLayerTest, spec.ScaleTest):
+  pass
 
-  def test_broadcast_failure(self):
-    x = test_utils.random_sequence(2, 3, 5, 9)
-    l = simple.Add.Config(shift=np.ones((5,))).make().bind({})
-    with self.assertRaises(ValueError):
-      l.get_output_shape_for_sequence(x)
 
-    with self.assertRaises(ValueError):
-      l.layer(x, training=False)
+class AddTest(test_utils.SequenceLayerTest, spec.AddTest):
+  pass
 
 
 class MinimumTest(test_utils.SequenceLayerTest):
@@ -344,34 +232,31 @@ def test_broadcast_failure(self):
       l.layer(x, training=False)
 
 
-class GatedUnitTest(test_utils.SequenceLayerTest):
+class GatedUnitTest(test_utils.SequenceLayerTest, spec.GatedUnitTest):
 
   @parameterized.parameters(
       itertools.product(
-          (simple.GatedUnit.Config(None, None),  # Bilinear
-           simple.GatedUnit.Config(None, jax.nn.swish),  # SwiGLU
-           simple.GatedUnit.Config(None, jax.nn.gelu),  # GeGLU
-           simple.GatedUnit.Config(lambda x: x, None),  # Bilinear
-           simple.GatedUnit.Config(jax.nn.swish, jax.nn.tanh),
-           simple.GatedTanhUnit.Config(),
-           simple.GatedLinearUnit.Config()),
-          ((2, 13, 6), (2, 13, 5, 10)))
-      )  # pyformat: disable
-  def test_gated_activation(self, layer_config, shape):
+          (
+              simple.GatedUnit.Config(None, None),  # Bilinear
+              simple.GatedUnit.Config(None, jax.nn.swish),  # SwiGLU
+              simple.GatedUnit.Config(None, jax.nn.gelu),  # GeGLU
+              simple.GatedUnit.Config(lambda x: x, None),  # Bilinear
+              simple.GatedUnit.Config(jax.nn.swish, jax.nn.tanh),
+              simple.GatedTanhUnit.Config(),
+              simple.GatedLinearUnit.Config(),
+          ),
+          ((2, 13, 6), (2, 13, 5, 10)),
+      )
+  )  # pyformat: disable
+  def test_variables_empty(self, layer_config, shape):
     key = jax.random.PRNGKey(1234)
     x = test_utils.random_sequence(*shape)
     l = layer_config.make()
     l = self.init_and_bind_layer(key, l, x)
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(
-        l.get_output_shape_for_sequence(x), shape[2:-1] + (shape[-1] // 2,)
-    )
-    self.verify_contract(l, x, training=True)
     self.assertEmpty(l.variables)
 
 
-class DropoutTest(test_utils.SequenceLayerTest):
+class DropoutTest(test_utils.SequenceLayerTest, spec.DropoutTest):
 
   @parameterized.parameters(
       jnp.float32, jnp.bfloat16, jnp.int32, jnp.int8, jnp.bool
@@ -588,28 +473,8 @@ def test_slice_wrongsize(self):
       l.layer(x, training=False)
 
 
-class FlattenTest(test_utils.SequenceLayerTest):
-
-  @parameterized.parameters(
-      (((2, 3, 5)),), (((2, 3, 5, 9)),), (((2, 3, 5, 9, 2)),)
-  )
-  def test_flatten(self, shape):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape)
-    l = simple.Flatten.Config(name='flatten').make()
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    num_elements = np.prod(shape[2:])
-    self.assertEqual(l.get_output_shape_for_sequence(x), (num_elements,))
-    self.assertEqual(l.name, 'flatten')
-
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
-    y_expected = x.apply_values(jnp.reshape, shape[:2] + (num_elements,))
-    self.assertSequencesEqual(y, y_expected)
+class FlattenTest(test_utils.SequenceLayerTest, spec.FlattenTest):
+  pass
 
 
 class GlobalReshapeTest(test_utils.SequenceLayerTest):
@@ -678,30 +543,7 @@ def test_wrong_shape(self):
       self.init_and_bind_layer(key, l, x)
 
 
-class ReshapeTest(test_utils.SequenceLayerTest):
-
-  @parameterized.parameters(
-      ((2, 3, 5), (1, 5, 1)),
-      ((2, 3, 5, 9), (3, 3, 5)),
-      ((2, 3, 1), ()),
-      ((2, 3), (1,)),
-  )
-  def test_reshape(self, shape, output_shape):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape)
-    l = simple.Reshape.Config(output_shape, name='reshape').make()
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.get_output_shape_for_sequence(x), output_shape)
-    self.assertEqual(l.name, 'reshape')
-
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
-    y_expected = x.apply_values(jnp.reshape, shape[:2] + output_shape)
-    self.assertSequencesEqual(y, y_expected)
+class ReshapeTest(test_utils.SequenceLayerTest, spec.ReshapeTest):
 
   def test_wrong_shape(self):
     l = simple.Reshape.Config([4], name='reshape').make().bind({})
@@ -714,36 +556,7 @@ def test_wrong_shape(self):
       l.layer(x, training=False)
 
 
-class TransposeTest(test_utils.SequenceLayerTest):
-
-  @parameterized.parameters(
-      ((2, 3, 4, 5), (2, 3), (4, 5)),
-      ((2, 3, 4, 5, 6), (4, 2, 3), (6, 4, 5)),
-      ((2, 3, 1, 2, 3), None, (3, 2, 1)),
-      ((2, 3), tuple(), tuple()),
-      ((2, 3), None, tuple()),
-  )
-  def test_transpose(self, input_shape, axes, output_shape):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*input_shape)
-    l = simple.Transpose.Config(axes=axes, name='transpose').make()
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.get_output_shape_for_sequence(x), output_shape)
-    self.assertEqual(l.name, 'transpose')
-
-    y = self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
-    if axes is not None:
-      y_expected = x.apply_values(jnp.transpose, (0, 1) + axes)
-    else:
-      axes = (0, 1) + tuple(range(2, x.ndim))[::-1]
-      y_expected = x.apply_values(jnp.transpose, axes)
-
-    self.assertSequencesEqual(y, y_expected)
+class TransposeTest(test_utils.SequenceLayerTest, spec.TransposeTest):
 
   @parameterized.parameters(
       ((2, 3), (2,)),
@@ -1066,20 +879,20 @@ def layer_vjp_fn(
     self.assertSequencesEqual(expected_gradients, y_layer_x_grad)
 
 
-class IdentityTest(test_utils.SequenceLayerTest):
+class ModuleSpecTest(test_utils.SequenceLayerTest, spec.ModuleSpecTest):
+  pass
+
+
+class IdentityTest(test_utils.SequenceLayerTest, spec.IdentityTest):
 
   @parameterized.parameters((((2, 3, 5)),), (((2, 3, 5, 9)),))
-  def test_identity(self, shape):
+  def test_jax_specifics(self, shape):
     key = jax.random.PRNGKey(1234)
     x = test_utils.random_sequence(*shape)
     l = simple.Identity(name='identity')
     l = self.init_and_bind_layer(key, l, x)
 
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
     self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-    self.assertEqual(l.name, 'identity')
-    self.verify_contract(l, x, training=False)
     self.assertEmpty(l.variables)
 
 
@@ -1157,10 +970,10 @@ def loss_fn(x_values):
     chex.assert_trees_all_close(grad, jnp.zeros_like(x.values))
 
 
-class OneHotTest(test_utils.SequenceLayerTest):
+class OneHotTest(test_utils.SequenceLayerTest, spec.OneHotTest):
 
   @parameterized.parameters(((1, 2, 3),), ((2, 3, 5, 9),), ((2, 3, 5, 9, 2),))
-  def test_one_hot(self, shape):
+  def test_variables_empty(self, shape):
     key = jax.random.PRNGKey(1234)
     depth = 4
     l = simple.OneHot.Config(depth, name='one_hot').make()
@@ -1213,11 +1026,13 @@ def embedding_layer_from_weights(
   return layer
 
 
-class EmbeddingTest(test_utils.SequenceLayerTest):
+class EmbeddingTest(test_utils.SequenceLayerTest, spec.EmbeddingTest):
 
-  @parameterized.parameters(((1, 2, 3),), ((2, 3, 5, 9),), ((2, 3, 5, 9, 2),))
-  def test_embedding(self, shape):
-    key = jax.random.PRNGKey(1234)
+  def test_embedding(self):
+    super().test_embedding()
+
+    # JAX-specific variables check
+    shape = (2, 3, 5, 9)
     dimension, num_embeddings = 8, 5
 
     l = simple.Embedding.Config(
@@ -1226,22 +1041,13 @@ def test_embedding(self, shape):
     x = test_utils.random_sequence(
         *shape, dtype=jnp.int32, low=0, high=num_embeddings - 1
     )
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(
-        l.get_output_shape_for_sequence(x), shape[2:] + (dimension,)
-    )
-    self.assertEqual(l.name, 'embedding')
-    l = self.init_and_bind_layer(key, l, x)
+    l = self.init_and_bind_layer(jax.random.PRNGKey(1234), l, x)
 
     y = self.verify_contract(
         l,
         x,
         training=False,
-        # Integer tensors have no gradient to test.
         test_gradients=False,
-        # Receptive field test is not supported for integers.
         test_receptive_field=False,
     )
 
@@ -1711,11 +1517,10 @@ def test_broadcast_failure(self):
       l.layer(x, training=False)
 
 
-class PointwiseMathTest(test_utils.SequenceLayerTest):
+class PointwiseMathTest(test_utils.SequenceLayerTest, spec.PointwiseMathTest):
 
   @parameterized.parameters(
       (simple.Abs.Config(), jnp.abs, (jnp.float32, jnp.complex64), None),
-      (simple.Elu.Config(), jax.nn.elu, (jnp.float32,), None),
       (simple.Exp.Config(), jnp.exp, (jnp.float32,), None),
       (simple.Gelu.Config(), jax.nn.gelu, (jnp.float32,), None),
       (simple.LeakyRelu.Config(), jax.nn.leaky_relu, (jnp.float32,), None),
@@ -1728,14 +1533,10 @@ class PointwiseMathTest(test_utils.SequenceLayerTest):
       (simple.Log.Config(), jnp.log, (jnp.float32,), None),
       (simple.Power.Config(2), jnp.square, (jnp.float32,), None),
       (simple.Power.Config(0.5), jnp.sqrt, (jnp.float32,), None),
-      (simple.Relu.Config(), jax.nn.relu, (jnp.float32,), None),
-      (simple.Sigmoid.Config(), jax.nn.sigmoid, (jnp.float32,), None),
-      (simple.Softmax.Config(), jax.nn.softmax, (jnp.float32,), None),
-      (simple.Softplus.Config(), jax.nn.softplus, (jnp.float32,), None),
-      (simple.Swish.Config(), jax.nn.swish, (jnp.float32,), None),
-      (simple.Tanh.Config(), jnp.tanh, (jnp.float32,), None),
   )
-  def test_pointwise_math(self, config, op, dtypes, expected_params):
+  def test_jax_specific_pointwise_math(
+      self, config, op, dtypes, expected_params
+  ):
     key = jax.random.PRNGKey(1234)
     batch_size, time, channels = 2, 10, 4
     for dtype in dtypes:
@@ -1770,81 +1571,9 @@ def test_pointwise_math(self, config, op, dtypes, expected_params):
       y_expected = x.apply_values(op).mask_invalid()
       self.assertSequencesClose(y, y_expected)
 
-  @parameterized.parameters(
-      (simple.Softmax.Config(), jax.nn.softmax, (jnp.float32,), -1),
-      (simple.Softmax.Config(), jax.nn.softmax, (jnp.float32,), -2),
-      (simple.Softmax.Config(), jax.nn.softmax, (jnp.float32,), 2),
-      (simple.Softmax.Config(), jax.nn.softmax, (jnp.float32,), 3),
-  )
-  def test_pointwise_math_axis(self, config, op, dtypes, axis):
-    key = jax.random.PRNGKey(1234)
-    batch_size, time, channels, channels2 = 2, 10, 4, 3
-    for dtype in dtypes:
-      x = test_utils.random_sequence(
-          batch_size, time, channels, channels2, dtype=dtype
-      )
-      l = dataclasses.replace(config, name='test', axis=axis).make()
-      l = self.init_and_bind_layer(key, l, x)
-
-      self.assertEqual(l.block_size, 1)
-      self.assertEqual(l.output_ratio, 1)
-      self.assertEqual(
-          l.get_output_shape_for_sequence(x), (channels, channels2)
-      )
-      self.assertEqual(l.name, 'test')
-      y = self.verify_contract(l, x, training=False)
-      self.assertEmpty(l.variables)
-
-      y_expected = x.apply_values(
-          functools.partial(op, axis=axis)
-      ).mask_invalid()
-      self.assertSequencesClose(y, y_expected)
-
-  @parameterized.parameters(
-      (simple.Softmax.Config(), (2, 10, 4), -2),
-      (simple.Softmax.Config(), (2, 10, 4), -3),
-      (simple.Softmax.Config(), (2, 10, 4), 0),
-      (simple.Softmax.Config(), (2, 10, 4), 1),
-      (simple.Softmax.Config(), (2, 10), -1),
-  )
-  def test_pointwise_math_axis_invalid(self, config, shape, axis):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape)
-    l = dataclasses.replace(config, name='test', axis=axis).make()
-
-    with self.assertRaises(ValueError):
-      self.init_and_bind_layer(key, l, x)
-
 
-class CastTest(test_utils.SequenceLayerTest):
-
-  @parameterized.parameters(
-      (((2, 3, 5)), jnp.float16),
-      (((2, 3, 5, 9)), jnp.int32),
-  )
-  def test_cast(self, shape, target_dtype):
-    key = jax.random.PRNGKey(1234)
-    x = test_utils.random_sequence(*shape, dtype=jnp.float32)
-    l = simple.Cast.Config(target_dtype, name='cast').make()
-    l = self.init_and_bind_layer(key, l, x)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
-    self.assertEqual(l.name, 'cast')
-
-    test_receptive_field = jnp.issubdtype(target_dtype, jnp.inexact)
-    y = self.verify_contract(
-        l,
-        x,
-        training=False,
-        padding_invariance_pad_value=jnp.nan
-        if target_dtype == jnp.float16
-        else 32768,
-        test_receptive_field=test_receptive_field,
-    )
-    self.assertEmpty(l.variables)
-    self.assertEqual(y.values.dtype, target_dtype)
+class CastTest(test_utils.SequenceLayerTest, spec.CastTest):
+  pass
 
 
 class ApplyShardingTest(test_utils.SequenceLayerTest):
@@ -1887,79 +1616,8 @@ def test_basic(self):
       # TODO(rryan): Test sharding was applied.
 
 
-class LambdaTest(test_utils.SequenceLayerTest):
-
-  @parameterized.parameters(True, False)
-  def test_array_fn(self, mask_required: bool):
-    def fn(v: jax.Array) -> jax.Array:
-      if mask_required:
-        # Change the masked status by adding 1.
-        v = v + 1.0
-      return v.reshape(v.shape + (1,)) > 0.5
-
-    l = (
-        simple.Lambda.Config(
-            fn,
-            mask_required=mask_required,
-            expected_input_spec=types.ShapeDType((5,), jnp.float32),
-            name='lambda',
-        )
-        .make()
-        .bind({})
-    )
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    # Output spec reflects the changed shape and dtype.
-    x = test_utils.random_sequence(2, 3, 5)
-    self.assertEqual(l.get_output_shape_for_sequence(x), (5, 1))
-    self.assertEqual(l.get_output_dtype(x.dtype), jnp.bool_)
-    self.assertEqual(l.name, 'lambda')
-    y = self.verify_contract(
-        l,
-        x,
-        training=False,
-        # Receptive field test is not supported for bools.
-        test_receptive_field=False,
-    )
-    self.assertEmpty(l.variables)
-    self.assertSequencesClose(y, x.apply_values(fn).mask_invalid())
-
-  @parameterized.parameters(True, False)
-  def test_sequence_fn(self, mask_required: bool):
-    def fn(x: types.Sequence) -> types.Sequence:
-      if mask_required:
-        # Change the masked status by adding 1.
-        x = x.apply_values(lambda v: v + 1.0)
-      return x.apply_values_masked(lambda v: v.reshape(v.shape + (1,)) > 0.5)
-
-    l = (
-        simple.Lambda.Config(
-            fn,
-            sequence_input=True,
-            expected_input_spec=types.ShapeDType((5,), jnp.float32),
-            name='lambda',
-        )
-        .make()
-        .bind({})
-    )
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    # Output spec reflects the changed shape and dtype.
-    x = test_utils.random_sequence(2, 3, 5)
-    self.assertEqual(l.get_output_shape_for_sequence(x), (5, 1))
-    self.assertEqual(l.get_output_dtype(x.dtype), jnp.bool_)
-    self.assertEqual(l.name, 'lambda')
-    y = self.verify_contract(
-        l,
-        x,
-        training=False,
-        # Receptive field test is not supported for bools.
-        test_receptive_field=False,
-    )
-    self.assertEmpty(l.variables)
-    self.assertSequencesClose(y, fn(x).mask_invalid())
+class LambdaTest(test_utils.SequenceLayerTest, spec.LambdaTest):
+  """Test behavior of Lambda layer."""
 
   def test_invalid_input(self):
     """Input that does not match expected_input_spec raises ValueError."""
@@ -2001,23 +1659,19 @@ def test_invalid_fn(self):
       l.layer(x, training=False)
 
 
-class CheckpointNameTest(test_utils.SequenceLayerTest):
+class CheckpointNameTest(test_utils.SequenceLayerTest, spec.CheckpointNameTest):
+  """Test behavior of CheckpointName layer."""
 
   def test_basic(self):
-    key = jax.random.PRNGKey(1234)
+    super().test_basic()
+
     x = test_utils.random_sequence(2, 3, 5)
+    key = jax.random.PRNGKey(1234)
     l = simple.CheckpointName.Config(
         checkpoint_name='test', name='checkpoint_name'
     ).make()
     l = self.init_and_bind_layer(key, l, x)
 
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.get_output_shape_for_sequence(x), (5,))
-    self.assertEqual(l.name, 'checkpoint_name')
-    self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
     policy = jax.checkpoint_policies.save_only_these_names('test')
 
     @functools.partial(jax.checkpoint, policy=policy)
@@ -2034,42 +1688,12 @@ def f(x: types.Sequence) -> types.Sequence:
     )
 
 
-class Downsample1DTest(test_utils.SequenceLayerTest):
+class Downsample1DTest(test_utils.SequenceLayerTest, spec.Downsample1DTest):
+  pass
 
-  @parameterized.parameters(((2, 3, 5), 2), ((2, 3, 5, 9), 3))
-  def test_downsample1d(self, shape, rate):
-    l = simple.Downsample1D.Config(rate, name='downsample_1d').make().bind({})
 
-    self.assertEqual(l.block_size, rate)
-    self.assertEqual(1 / l.output_ratio, rate)
-    self.assertTrue(l.supports_step)
-    self.assertEqual(l.name, 'downsample_1d')
-    self.assertEmpty(l.variables)
-
-    x = test_utils.random_sequence(*shape)
-    self.assertEqual(l.get_output_shape_for_sequence(x), x.channel_shape)
-    y = self.verify_contract(l, x, training=False)
-    self.assertAllEqual(x.values[:, ::rate], y.values)
-    self.assertAllEqual(x.mask[:, ::rate], y.mask)
-
-
-class Upsample1DTest(test_utils.SequenceLayerTest):
-
-  @parameterized.parameters(((2, 3, 5), 2), ((2, 3, 5, 9), 3))
-  def test_upsample1d(self, shape, rate):
-    l = simple.Upsample1D.Config(rate, name='upsample_1d').make().bind({})
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, rate)
-    self.assertTrue(l.supports_step)
-    self.assertEqual(l.name, 'upsample_1d')
-    self.assertEmpty(l.variables)
-
-    x = test_utils.random_sequence(*shape)
-    self.assertEqual(l.get_output_shape_for_sequence(x), x.channel_shape)
-    y = self.verify_contract(l, x, training=False)
-    for i in range(rate):
-      self.assertAllEqual(x.values, y.values[:, i::rate])
+class Upsample1DTest(test_utils.SequenceLayerTest, spec.Upsample1DTest):
+  pass
 
 
 class Upsample2DTest(test_utils.SequenceLayerTest):
@@ -2095,24 +1719,8 @@ def test_upsample2d(self, shape, rate):
       self.assertAllEqual(x.values, y.values[:, i :: rate[0], j :: rate[1], :])
 
 
-class MaskInvalidTest(test_utils.SequenceLayerTest):
-
-  def test_basic(self):
-    x = test_utils.random_sequence(2, 15, 5)
-    l = simple.MaskInvalid.Config(name='mask_invalid').make().bind({})
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(l.get_output_shape_for_sequence(x), (5,))
-    self.assertEqual(l.name, 'mask_invalid')
-    self.verify_contract(l, x, training=False)
-    self.assertEmpty(l.variables)
-
-    x = x.mask_invalid(np.nan)
-    self.assertIsInstance(x, types.Sequence)
-    y = l.layer(x, training=False)
-    self.assertIsInstance(y, types.MaskedSequence)
-    self.assertSequencesEqual(x.mask_invalid(), y)
+class MaskInvalidTest(test_utils.SequenceLayerTest, spec.MaskInvalidTest):
+  pass
 
 
 class ReduceTest(test_utils.SequenceLayerTest):
@@ -2195,106 +1803,8 @@ def test_reduce_invalid_axis(self, layer_config, axis):
       self.init_and_bind_layer(key, l, x)
 
 
-class Has:
-  """A simple `HAS(v)` matcher that tests whether something has `v` in it."""
-
-  def __init__(self, value):
-    self._v = value
-
-  def __eq__(self, o):
-    return self._v in o
-
-  def __ne__(self, o):
-    return not self == o
-
-  def __repr__(self):
-    return '<HAS(%r)>' % self._v
-
-
-class Not:
-  """Negates a matcher."""
-
-  def __init__(self, matcher):
-    self._matcher = matcher
-
-  def __eq__(self, o):
-    return self._matcher != o
-
-  def __ne__(self, o):
-    return not self == o
-
-  def __repr__(self):
-    return '<NOT(%r)>' % self._matcher
-
-
-class LoggingTest(test_utils.SequenceLayerTest):
-
-  @mock.patch.object(logging, 'info', wraps=logging.info)
-  def test_logs_tensors(self, mock_logger):
-    x = types.Sequence.from_values(jnp.asarray([[1.414, 2, 3, 4]]))
-    state = types.Sequence.from_values(jnp.asarray([[1, 2.718, 3, 4]]))
-    training = False
-    constants = {
-        'foo': jnp.asarray([[1, 2, 3.14, 4]]),
-        'bar': np.asarray([[1, 2, 3, 4.2]]),
-    }
-
-    with self.subTest('prefix'):
-      l = simple.Logging.Config(prefix='test string').make().bind({})
-      l.layer(x, training=training, constants=constants)
-      mock_logger.assert_called_with(Has('test string'))
-
-    with self.subTest('specs_only'):
-      l = simple.Logging.Config(dump_tensors=False).make().bind({})
-      with self.subTest('layer'):
-        l.layer(x, training=training, constants=constants)
-        mock_logger.assert_called_with(Not(Has('1.414')))
-        mock_logger.assert_called_with(Not(Has('3.14')))
-        mock_logger.assert_called_with(Not(Has('4.2')))
-        mock_logger.assert_called_with(Has('(1, 4)'))
-        mock_logger.assert_called_with(Has('float32'))
-      with self.subTest('get_initial_state'):
-        l.get_initial_state(
-            batch_size=x.shape[0],
-            input_spec=x.channel_spec,
-            training=training,
-            constants=constants,
-        )
-        mock_logger.assert_called_with(Not(Has('3.14')))
-        mock_logger.assert_called_with(Not(Has('4.2')))
-        mock_logger.assert_called_with(Has('(1, 4)'))
-        mock_logger.assert_called_with(Has('float32'))
-      with self.subTest('step'):
-        l.step(x, state, training=training, constants=constants)
-        mock_logger.assert_called_with(Not(Has('1.414')))
-        mock_logger.assert_called_with(Not(Has('2.718')))
-        mock_logger.assert_called_with(Not(Has('3.14')))
-        mock_logger.assert_called_with(Not(Has('4.2')))
-        mock_logger.assert_called_with(Has('(1, 4)'))
-        mock_logger.assert_called_with(Has('float32'))
-
-    with self.subTest('dumps_tensors'):
-      l = simple.Logging.Config(dump_tensors=True).make().bind({})
-      with self.subTest('layer'):
-        l.layer(x, training=training, constants=constants)
-        mock_logger.assert_called_with(Has('1.414'))
-        mock_logger.assert_called_with(Has('3.14'))
-        mock_logger.assert_called_with(Has('4.2'))
-      with self.subTest('get_initial_state'):
-        l.get_initial_state(
-            batch_size=x.shape[0],
-            input_spec=x.channel_spec,
-            training=training,
-            constants=constants,
-        )
-        mock_logger.assert_called_with(Has('3.14'))
-        mock_logger.assert_called_with(Has('4.2'))
-      with self.subTest('step'):
-        l.step(x, state, training=training, constants=constants)
-        mock_logger.assert_called_with(Has('1.414'))
-        mock_logger.assert_called_with(Has('2.718'))
-        mock_logger.assert_called_with(Has('3.14'))
-        mock_logger.assert_called_with(Has('4.2'))
+class LoggingTest(test_utils.SequenceLayerTest, spec.LoggingTest):
+  """Test behavior of Logging layer."""
 
 
 class ArgmaxTest(test_utils.SequenceLayerTest):
@@ -2328,63 +1838,7 @@ def test_argmax(self, input_array: jnp.ndarray):
     self.assertAllEqual(y.values, jnp.array([[2], [0]]))
 
 
-class SqueezeTest(test_utils.SequenceLayerTest):
-
-  @parameterized.named_parameters(
-      dict(
-          testcase_name='float_input',
-          input_array=np.array(
-              [[[3]]],
-              dtype=np.float32,
-          ),
-          expected_output=np.array([[3]]),
-      ),
-      dict(
-          testcase_name='int_input',
-          input_array=np.array(
-              [[[3]]],
-              dtype=np.int32,
-          ),
-          expected_output=np.array([[3]], dtype=np.int32),
-      ),
-      dict(
-          testcase_name='no_op_input',
-          input_array=np.array(
-              [[3]],
-              dtype=np.float32,
-          ),
-          expected_output=np.array([[3]]),
-      ),
-      dict(
-          testcase_name='input_with_extra_dims',
-          input_array=np.array(
-              [[[[[3], [4]]]]],
-              dtype=np.float32,
-          ),
-          expected_output=np.array([[[3, 4]]]),
-      ),
-  )
-  def test_squeeze(
-      self, input_array: jnp.ndarray, expected_output: jnp.ndarray
-  ):
-    key = jax.random.PRNGKey(1234)
-    x = types.Sequence.from_values(input_array)
-    l = simple.Squeeze.Config(name='squeeze').make()
-    l = self.init_and_bind_layer(key, l, x)
-
-    _ = l.layer(x, training=False)
-
-    self.assertEqual(l.block_size, 1)
-    self.assertEqual(l.output_ratio, 1)
-    self.assertEqual(
-        l.get_output_shape_for_sequence(x), expected_output.shape[2:]
-    )
-    self.assertEqual(l.name, 'squeeze')
-    test_receptive_field = jnp.issubdtype(input_array.dtype, jnp.inexact)
-    self.verify_contract(
-        l, x, training=False, test_receptive_field=test_receptive_field
-    )
-    self.assertEmpty(l.variables)
+class SqueezeTest(test_utils.SequenceLayerTest, spec.SqueezeTest):
 
   @parameterized.parameters(
       ((2, 3, 1, 1, 1), 2, (1, 1)),
diff --git a/sequence_layers/jax/utils.py b/sequence_layers/jax/utils.py
index b900b59..7885a70 100644
--- a/sequence_layers/jax/utils.py
+++ b/sequence_layers/jax/utils.py
@@ -21,16 +21,20 @@
 import pprint
 import re
 import typing
-from typing import Any, Callable, Protocol, Self, Sequence as TypingSequence, TypeVar
+from typing import Any, Callable, Protocol, Self
+from typing import Sequence as TypingSequence
+from typing import TypeVar
 
 import flax.core.scope
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
 import numpy as np
+
 from sequence_layers.jax import meta
 from sequence_layers.jax import types
 from sequence_layers.jax import typing as jt
+from sequence_layers.specs import combinators as spec_combinators
 
 
 @jt.typed
@@ -637,49 +641,7 @@ def sequence_broadcast_affine(
   return types.Sequence(values, mask)
 
 
-@enum.unique
-class CombinationMode(enum.Enum):
-  """The type of combination to perform."""
-
-  # Broadcasts inputs together and stacks them on the first channel axis:
-  #
-  # Examples:
-  # x=() y=() -> (2)
-  # x=() y=(2) -> (2, 2)
-  # x=(3) y=(3) -> (2, 3)
-  # x=(5) y=(3, 5) -> (2, 3, 5)
-  STACK = 1
-
-  # Broadcasts inputs together and concatenates them on the final channel axis:
-  #
-  # Examples:
-  # x=() y=() -> (2)
-  # x=() y=(2) -> (3)
-  # x=(3) y=(3) -> (6)
-  # x=(5) y=(3, 5) -> (3, 10)
-  CONCAT = 2
-  # Broadcasts inputs together and adds them.
-  #
-  # Examples:
-  # x=() y=() -> ()
-  # x=() y=(2) -> (2)
-  # x=(3) y=(3) -> (3)
-  # x=(5) y=(3, 5) -> (3, 5)
-  ADD = 3
-  # Broadcasts inputs together and averages them.
-  #
-  # Examples:
-  # x=() y=() -> ()
-  # x=() y=(2) -> (2)
-  # x=(3) y=(3) -> (3)
-  # x=(5) y=(3, 5) -> (3, 5)
-  MEAN = 4
-  # Examples:
-  # x=() y=() -> ()
-  # x=() y=(2) -> (2)
-  # x=(3) y=(3) -> (3)
-  # x=(5) y=(3, 5) -> (3, 5)
-  PRODUCT = 5
+CombinationMode = spec_combinators.CombinationMode
 
 
 def sequence_broadcast_combine(
@@ -2233,6 +2195,7 @@ def layer_with_emits_spec(
       values_spec,
       types.ShapeDType(values_spec.shape[:2], dtype=types.MASK_DTYPE),
   )
+
   def layer_fn(
       layer: types.SequenceLayer,
       x: types.Sequence,
diff --git a/sequence_layers/mlx/__init__.py b/sequence_layers/mlx/__init__.py
index d44f0cf..a4b0940 100644
--- a/sequence_layers/mlx/__init__.py
+++ b/sequence_layers/mlx/__init__.py
@@ -1,4 +1,3 @@
-# pylint: disable=cyclic-import,g-importing-member
 # Copyright 2026 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,11 +13,112 @@
 # limitations under the License.
 """Sequence layers in MLX."""
 
+# CRITICAL: Do NOT use wildcard imports (e.g., `from .simple import *`) here.
+# Pyrefly (our static analysis tool) has a known limitation with cross-module
+# resolution of diamond inheritance chains. When wildcard imports are used to
+# re-export classes from `simple.py` (which combine `types` and `spec` bases),
+# Pyrefly fails to resolve the concrete method implementations in `mlx/types.py`
+# and flags all instances as abstract (`bad-instantiation` false positives).
+#
+# Explicit imports (e.g., `from .simple import Relu`) DO NOT trigger this issue.
+# If you need to expose specific layers at the package level, import them
+# explicitly instead of using a star import.
+from . import attention
 from . import backend
+from . import dense
+from . import dsp
+from . import projection_configs
+from . import simple
 from . import test_utils
 from . import types
+from . import types as basic_types
+from . import utils
+from .attention import DotProductAttention
+from .attention import DotProductSelfAttention
+from .attention import LocalDotProductSelfAttention
+from .attention import StreamingDotProductAttention
+from .attention import StreamingLocalDotProductAttention
+from .combinators import CombinationMode
+from .combinators import Parallel
+from .combinators import Repeat
+from .combinators import Residual
+from .combinators import Serial
+from .combinators import SerialCombinatorMixin
+from .combinators import SerialModules
+from .conditioning import Conditioning
+from .convolution import Conv1D
+from .convolution import Conv1DTranspose
+from .convolution import DepthwiseConv1D
+from .convolution2d import AveragePooling2D
+from .convolution2d import Conv2D
+from .convolution2d import Conv2DTranspose
+from .convolution2d import ParallelChannels
+from .convolution2d import Upsample2D
+from .dense import Dense
+from .dense import EinsumDense
+from .dsp import Delay
+from .dsp import FFT
+from .dsp import Frame
+from .dsp import IFFT
+from .dsp import InverseSTFT
+from .dsp import IRFFT
+from .dsp import LinearToMelSpectrogram
+from .dsp import Lookahead
+from .dsp import OverlapAdd
+from .dsp import RFFT
+from .dsp import STFT
+from .dsp import Window
+from .normalization import BatchNormalization
+from .normalization import GroupNormalization
+from .normalization import L2Normalize
+from .normalization import LayerNormalization
+from .normalization import RMSNormalization
+from .pooling import AveragePooling1D
+from .pooling import MaxPooling1D
+from .pooling import MinPooling1D
+from .position import AddTimingSignal
+from .position import ApplyRotaryPositionalEncoding
+from .projection_configs import CombinedQueryKeyValueProjection
+from .projection_configs import QueryAndKeyValueProjection
+from .projection_configs import QueryAndSharedKeyValueProjection
+from .projection_configs import SeparateQueryKeyValueProjection
+from .simple import Abs
+from .simple import Add
+from .simple import Cast
+from .simple import CheckpointName
+from .simple import Downsample1D
+from .simple import Dropout
+from .simple import Elu
+from .simple import Embedding
+from .simple import Exp
+from .simple import ExpandDims
+from .simple import Flatten
+from .simple import GatedLinearUnit
+from .simple import GatedTanhUnit
+from .simple import GatedUnit
+from .simple import Gelu
+from .simple import Identity
+from .simple import Lambda
+from .simple import LeakyRelu
+from .simple import Log
+from .simple import Logging
+from .simple import MaskInvalid
+from .simple import OneHot
+from .simple import Relu
+from .simple import Reshape
+from .simple import Scale
+from .simple import Sigmoid
+from .simple import Softmax
+from .simple import Softplus
+from .simple import Squeeze
+from .simple import Swish
+from .simple import Tanh
+from .simple import Transpose
+from .simple import Upsample1D
 from .test_utils import SequenceLayerTest
 from .types import ChannelSpec
+from .types import check_layer
+from .types import check_step
 from .types import Constants
 from .types import DType
 from .types import Emits
@@ -38,7 +138,10 @@
 from .types import StatelessPointwise
 
 __all__ = [
+    'basic_types',
+    'dense',
     'backend',
+    'simple',
     'types',
     'test_utils',
     'SequenceLayerTest',
@@ -47,6 +150,12 @@
     'MaskedSequence',
     'SequenceLayer',
     'SequenceLayerConfig',
+    'check_layer',
+    'check_step',
+    'Stateless',
+    'StatelessPointwise',
+    'PreservesShape',
+    'PreservesType',
     'MaskT',
     'Shape',
     'ShapeDType',
@@ -56,8 +165,87 @@
     'Emits',
     'Emitting',
     'ChannelSpec',
-    'Stateless',
-    'StatelessPointwise',
-    'PreservesShape',
-    'PreservesType',
+    'Conditioning',
+    'Dense',
+    'EinsumDense',
+    'Conv1D',
+    'DepthwiseConv1D',
+    'Conv1DTranspose',
+    'Conv2D',
+    'Conv2DTranspose',
+    'AveragePooling2D',
+    'Upsample2D',
+    'ParallelChannels',
+    'MaxPooling1D',
+    'MinPooling1D',
+    'AveragePooling1D',
+    'Serial',
+    'SerialModules',
+    'SerialCombinatorMixin',
+    'Residual',
+    'Repeat',
+    'Parallel',
+    'CombinationMode',
+    'DotProductSelfAttention',
+    'DotProductAttention',
+    'StreamingDotProductAttention',
+    'StreamingLocalDotProductAttention',
+    'LocalDotProductSelfAttention',
+    'CombinedQueryKeyValueProjection',
+    'QueryAndKeyValueProjection',
+    'QueryAndSharedKeyValueProjection',
+    'SeparateQueryKeyValueProjection',
+    'AddTimingSignal',
+    'ApplyRotaryPositionalEncoding',
+    'Identity',
+    'Relu',
+    'Gelu',
+    'Abs',
+    'Exp',
+    'Log',
+    'Swish',
+    'Tanh',
+    'Sigmoid',
+    'LeakyRelu',
+    'Elu',
+    'Softmax',
+    'Softplus',
+    'Cast',
+    'Scale',
+    'Add',
+    'MaskInvalid',
+    'GatedUnit',
+    'GatedLinearUnit',
+    'GatedTanhUnit',
+    'Flatten',
+    'Reshape',
+    'ExpandDims',
+    'Squeeze',
+    'Transpose',
+    'OneHot',
+    'Embedding',
+    'Dropout',
+    'Downsample1D',
+    'Upsample1D',
+    'CheckpointName',
+    'Lambda',
+    'Logging',
+    'L2Normalize',
+    'RMSNormalization',
+    'LayerNormalization',
+    'BatchNormalization',
+    'GroupNormalization',
+    'dsp',
+    'Delay',
+    'FFT',
+    'Frame',
+    'IFFT',
+    'IRFFT',
+    'InverseSTFT',
+    'LinearToMelSpectrogram',
+    'Lookahead',
+    'OverlapAdd',
+    'RFFT',
+    'STFT',
+    'Window',
 ]
diff --git a/sequence_layers/mlx/attention.py b/sequence_layers/mlx/attention.py
new file mode 100644
index 0000000..bbf1464
--- /dev/null
+++ b/sequence_layers/mlx/attention.py
@@ -0,0 +1,2422 @@
+"""Dot-product attention layers for MLX."""
+
+import dataclasses
+import math
+from types import MethodType
+from typing import Any, cast, override
+
+import mlx.core as mx
+
+from sequence_layers.mlx import init_mapping
+from sequence_layers.mlx import projection_configs
+from sequence_layers.mlx import types
+from sequence_layers.mlx.projection_configs import \
+    CombinedQueryKeyValueProjection
+from sequence_layers.mlx.projection_configs import QueryAndKeyValueProjection
+from sequence_layers.mlx.projection_configs import \
+    QueryAndSharedKeyValueProjection
+from sequence_layers.mlx.projection_configs import \
+    SeparateQueryKeyValueProjection
+from sequence_layers.specs import attention as attention_spec
+
+Sequence = types.Sequence
+MaskedSequence = types.MaskedSequence
+
+
+@dataclasses.dataclass(frozen=True)
+class SelfAttentionEmits:
+  """A structure for emits produced by self attention layers."""
+
+  probabilities: Sequence
+
+
+@dataclasses.dataclass(frozen=True)
+class CrossAttentionEmits:
+  """A structure for emits produced by attention layers."""
+
+  probabilities_by_source: dict[str, Sequence]
+
+
+__all__ = (
+    'DotProductSelfAttention',
+    'DotProductAttention',
+    'StreamingDotProductAttention',
+    'LocalDotProductSelfAttention',
+    'CombinedQueryKeyValueProjection',
+    'SeparateQueryKeyValueProjection',
+    'QueryAndKeyValueProjection',
+    'QueryAndSharedKeyValueProjection',
+    'SelfAttentionEmits',
+    'CrossAttentionEmits',
+)
+
+
+def _quantized_matmul_proj(x, q_weight, q_scales, q_biases, group_size, bits):
+  """Computes quantized matrix multiplication projection."""
+  return mx.quantized_matmul(
+      x,
+      q_weight,
+      scales=q_scales,
+      biases=q_biases,
+      transpose=True,
+      group_size=group_size,
+      bits=bits,
+  )
+
+
+def _query_scale_vector(per_dim_scale, query_scale, units_per_head, dtype):
+  """Compute the per-dimension query scale vector.
+
+  Returns:
+    scale: [units_per_head] array or scalar float.
+  """
+  if query_scale is None:
+    query_scale = 1.0 / math.sqrt(units_per_head)
+  if per_dim_scale is not None:
+    r_softplus_0 = 1.442695041
+    scale = r_softplus_0 * query_scale
+    softplus = mx.log1p(mx.exp(per_dim_scale.astype(dtype)))
+    return scale * softplus
+  return query_scale
+
+
+def _scale_queries(queries, per_dim_scale, query_scale, units_per_head):
+  """Scale queries, optionally with per-dimension learned scale.
+
+  Matches JAX backend's _scale_query in common.py.
+
+  Args:
+    queries: [b, num_heads, q_time, units_per_head].
+    per_dim_scale: [units_per_head] learned scale or None.
+    query_scale: float scale or None (defaults to 1/sqrt(uph)).
+    units_per_head: int.
+
+  Returns:
+    Scaled queries, same shape.
+  """
+  scale = _query_scale_vector(
+      per_dim_scale, query_scale, units_per_head, queries.dtype
+  )
+  return queries * scale
+
+
+def _causal_mask(q_len, kv_len):
+  """Build a [1, 1, q_len, kv_len] causal mask (True = attend)."""
+  # Each query at position i can attend to keys at positions
+  # [kv_len - q_len, ..., kv_len - q_len + i].
+  row = mx.arange(q_len)
+  col = mx.arange(kv_len)
+  # query i (global pos = kv_len - q_len + i) can see key j
+  # if j <= kv_len - q_len + i.
+  offset = kv_len - q_len
+  mask = mx.expand_dims(col, axis=0) <= (mx.expand_dims(row, axis=1) + offset)
+  return mask.reshape(1, 1, q_len, kv_len)
+
+
+class DotProductSelfAttention(
+    types.Emitting,
+    attention_spec.DotProductSelfAttention[types.Sequence, types.ChannelSpec],
+):
+  """Multi-headed dot-product self attention for MLX.
+
+  Supports:
+  - Grouped Query Attention (num_kv_heads < num_heads)
+  - Causal masking via max_past_horizon
+  - KV cache for step-by-step inference
+  - Optional query/key/value processing networks (e.g. RoPE)
+
+  Kernels are stored in Linen-compatible shapes:
+    q_proj: [in_features, num_heads * units_per_head]
+    k_proj: [in_features, num_kv_heads * units_per_head]
+    v_proj: [in_features, num_kv_heads * units_per_head]
+    out_proj: [num_heads * units_per_head, in_features]
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(
+      types.SequenceLayerConfig,
+      attention_spec.DotProductSelfAttention.Config,
+  ):
+    """MLX-native configuration for DotProductSelfAttention."""
+
+    num_heads: int
+    units_per_head: int
+    max_past_horizon: int
+    max_future_horizon: int = 0
+    num_kv_heads: int | None = None
+    attention_probabilities_dropout_rate: float = 0.0
+    broadcast_dropout_across_queries: bool = False
+    use_bias: bool = False
+    input_projection: projection_configs.QueryKeyValueProjectionConfig = (
+        dataclasses.field(
+            default_factory=projection_configs.CombinedQueryKeyValueProjection
+        )
+    )
+    query_network: Any = None
+    key_network: Any = None
+    value_network: Any = None
+    attention_logits_soft_cap: float | None = None
+    per_dim_scale: bool = False
+    query_scale: float | None = None
+    zero_fully_masked: bool = False
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    num_sink_embeddings: int = 0
+    use_sink_scalars: bool = False
+    use_kv_cache_ringbuffer: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> 'DotProductSelfAttention':
+      return DotProductSelfAttention(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      in_features: int | None = None,
+      num_heads: int | None = None,
+      units_per_head: int | None = None,
+      max_past_horizon: int | None = None,
+      max_future_horizon: int = 0,
+      num_kv_heads: int | None = None,
+      use_bias: bool = False,
+      query_scale: float | None = None,
+      per_dim_scale: bool = False,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+      kernel_init=None,
+      bias_init=None,
+      query_network: types.SequenceLayer | None = None,
+      key_network: types.SequenceLayer | None = None,
+      value_network: types.SequenceLayer | None = None,
+      attention_logits_soft_cap: float | None = None,
+      num_sink_embeddings: int = 0,
+      input_projection=None,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if (
+          num_heads is None
+          or units_per_head is None
+          or max_past_horizon is None
+      ):
+        raise ValueError(
+            'Must provide either config or num_heads, units_per_head, and'
+            ' max_past_horizon'
+        )
+      num_heads = cast(int, num_heads)
+      units_per_head = cast(int, units_per_head)
+      max_past_horizon = cast(int, max_past_horizon)
+      input_projection_val = (
+          input_projection
+          or projection_configs.CombinedQueryKeyValueProjection()
+      )
+      self.config = self.Config(
+          num_heads=num_heads,
+          units_per_head=units_per_head,
+          max_past_horizon=max_past_horizon,
+          max_future_horizon=max_future_horizon,
+          num_kv_heads=num_kv_heads,
+          use_bias=use_bias,
+          query_scale=query_scale,
+          per_dim_scale=per_dim_scale,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+          query_network=query_network,
+          key_network=key_network,
+          value_network=value_network,
+          attention_logits_soft_cap=attention_logits_soft_cap,
+          num_sink_embeddings=num_sink_embeddings,
+          input_projection=input_projection_val,
+      )
+
+    self.compute_dtype = (
+        init_mapping.to_mx_dtype(self.config.compute_dtype)
+        if self.config.compute_dtype
+        else None
+    )
+    self._param_dtype = (
+        init_mapping.to_mx_dtype(self.config.param_dtype) or mx.float32
+    )
+
+    self.in_features = None
+    self.num_heads = self.config.num_heads
+    self.units_per_head = self.config.units_per_head
+    self.max_past_horizon = self.config.max_past_horizon
+    self.max_future_horizon = self.config.max_future_horizon
+    self.num_kv_heads = self.config.num_kv_heads or self.num_heads
+    self.use_bias = self.config.use_bias
+    self._query_scale = self.config.query_scale
+    self._attention_logits_soft_cap = self.config.attention_logits_soft_cap
+
+    self._kernel_init = kernel_init
+    self._bias_init = bias_init
+    self._per_dim_scale = None
+
+    self.query_network: Any = query_network or self.config.query_network
+    self.key_network: Any = key_network or self.config.key_network
+    self.value_network: Any = value_network or self.config.value_network
+
+    self.sink_key_embeddings: Any = None
+    self.sink_value_embeddings: Any = None
+
+    self.q_proj: Any = None
+    self.kv_proj: Any = None
+    self.qkv_proj_qw: Any = None
+    self.qkv_proj_qs: Any = None
+    self.qkv_proj_qb: Any = None
+    self._quant_group_size: int | None = None
+    self._quant_bits: int | None = None
+    self._project_qkv_fn: Any = None
+
+    self._initialized = False
+
+    if in_features is not None:
+      self._ensure_initialized(in_features)
+
+  def _ensure_initialized(self, in_features: int):
+    """Ensure parameters and submodules are dynamically initialized."""
+    if self._initialized:
+      return
+    self._initialized = True
+    self.in_features = in_features
+
+    # pylint: disable=import-outside-toplevel
+    from sequence_layers.mlx import utils as mlx_utils
+
+    if hasattr(self.query_network, 'make'):
+      self.query_network = mlx_utils.make_layer(self.query_network)
+    if hasattr(self.key_network, 'make'):
+      self.key_network = mlx_utils.make_layer(self.key_network)
+    if hasattr(self.value_network, 'make'):
+      self.value_network = mlx_utils.make_layer(self.value_network)
+
+    param_dtype = self._param_dtype
+    per_dim_scale = self.config.per_dim_scale
+    units_per_head = self.units_per_head
+    num_heads = self.num_heads
+    num_kv_heads = self.num_kv_heads
+    use_bias = self.use_bias
+    input_projection = self.config.input_projection
+    num_sink_embeddings = self.config.num_sink_embeddings
+
+    self._per_dim_scale = (
+        mx.zeros((units_per_head,), dtype=param_dtype)
+        if per_dim_scale
+        else None
+    )
+
+    kernel_init = self._kernel_init
+    bias_init = self._bias_init
+
+    if kernel_init is None:
+      qkv_init = (
+          getattr(input_projection, 'qkv_kernel_init', None)
+          or getattr(input_projection, 'q_kernel_init', None)
+          or getattr(input_projection, 'kv_kernel_init', None)
+      )
+      if qkv_init is not None:
+        kernel_init = init_mapping.map_initializer(qkv_init)
+      else:
+        kernel_init = init_mapping.make_variance_scaling_init(
+            'fan_in', 'truncated_normal'
+        )
+
+    if bias_init is None:
+      qkv_bias_init = (
+          getattr(input_projection, 'bias_init', None)
+          or getattr(input_projection, 'q_bias_init', None)
+          or getattr(input_projection, 'kv_bias_init', None)
+      )
+      if qkv_bias_init is not None:
+        bias_init = init_mapping.map_initializer(qkv_bias_init)
+      else:
+        bias_init = init_mapping.zeros_init
+
+    key = mx.random.key(0)
+    q_dim = num_heads * units_per_head
+    kv_dim = num_kv_heads * units_per_head
+
+    self.input_projection = input_projection
+    if (
+        isinstance(
+            input_projection, projection_configs.CombinedQueryKeyValueProjection
+        )
+        and num_kv_heads == num_heads
+    ):
+      out_dim = q_dim + 2 * kv_dim
+      self.qkv_proj = kernel_init(key, (in_features, out_dim), param_dtype)
+      if use_bias:
+        self.qkv_bias = bias_init(key, (out_dim,), param_dtype)
+    else:
+      self.q_proj = kernel_init(key, (in_features, q_dim), param_dtype)
+      self.kv_proj = mx.concatenate(
+          [
+              kernel_init(key, (in_features, kv_dim), param_dtype),
+              kernel_init(key, (in_features, kv_dim), param_dtype),
+          ],
+          axis=-1,
+      )
+      if use_bias:
+        self.q_bias = bias_init(key, (q_dim,), param_dtype)
+        self.kv_bias = mx.concatenate(
+            [
+                bias_init(key, (kv_dim,), param_dtype),
+                bias_init(key, (kv_dim,), param_dtype),
+            ],
+            axis=-1,
+        )
+
+    self.num_sink_embeddings = num_sink_embeddings
+    if num_sink_embeddings > 0:
+      self.sink_key_embeddings = mx.zeros(
+          (num_sink_embeddings, num_heads, units_per_head), dtype=param_dtype
+      )
+      self.sink_value_embeddings = mx.zeros(
+          (num_sink_embeddings, num_kv_heads, units_per_head), dtype=param_dtype
+      )
+    else:
+      self.sink_key_embeddings = None
+      self.sink_value_embeddings = None
+
+  @property
+  @override
+  def supports_step(self):
+    supports = self.max_past_horizon >= 0 and self.max_future_horizon >= 0
+    if self.query_network is not None:
+      supports = supports and self.query_network.supports_step
+    if self.key_network is not None:
+      supports = supports and self.key_network.supports_step
+    if self.value_network is not None:
+      supports = supports and self.value_network.supports_step
+    return supports
+
+  @property
+  @override
+  def input_latency(self):
+    return max(0, self.max_future_horizon)
+
+  def _project_qkv(self, x):
+    """Project input to Q, K, V sequences."""
+    self._ensure_initialized(x.shape[-1])
+    b, t = x.shape[0], x.shape[1]
+    dtype = self.compute_dtype or x.dtype
+
+    v = x.values.astype(dtype)
+
+    if hasattr(self, 'qkv_proj'):
+      qkv = mx.matmul(v, self.qkv_proj.astype(dtype))
+      if self.use_bias:
+        qkv = qkv + self.qkv_bias.astype(dtype)
+
+      q, k, val = mx.split(qkv, 3, axis=-1)
+    else:
+      q = mx.matmul(v, self.q_proj.astype(dtype))
+      kv = mx.matmul(v, self.kv_proj.astype(dtype))
+      k, val = mx.split(kv, 2, axis=-1)
+
+      if self.use_bias:
+        q = q + self.q_bias.astype(dtype)
+        kv_bias = self.kv_bias.astype(dtype)
+        kb, vb = mx.split(kv_bias, 2, axis=-1)
+        k = k + kb
+        val = val + vb
+
+    # Reshape to [b, t, heads, units_per_head].
+    q = q.reshape(b, t, self.num_heads, self.units_per_head)
+    k = k.reshape(b, t, self.num_kv_heads, self.units_per_head)
+    val = val.reshape(b, t, self.num_kv_heads, self.units_per_head)
+
+    return (
+        Sequence(q, x.mask),
+        Sequence(k, x.mask),
+        Sequence(val, x.mask),
+    )
+
+  def _compute_attention(
+      self, queries, keys, values, mask, emit_attention_weights=False
+  ):
+    """Compute scaled dot-product attention.
+
+    Args:
+      queries: [b, q_t, num_heads, units_per_head]
+      keys: [b, kv_t, num_kv_heads, units_per_head]
+      values: [b, kv_t, num_kv_heads, units_per_head]
+      mask: [b, 1, q_t, kv_t] boolean mask (True = attend)
+      emit_attention_weights: bool. Whether to emit attention weights.
+
+    Returns:
+      (context, weights) tuple.
+        context: [b, q_t, num_heads, units_per_head]
+        weights: [b, q_t, num_heads, kv_t] or ()
+    """
+    # Use mx.fast.scaled_dot_product_attention unless soft_cap forces
+    # manual logit manipulation or emits are requested.
+    has_soft_cap = getattr(self, '_attention_logits_soft_cap', None) is not None
+
+    if not has_soft_cap and not emit_attention_weights:
+      # SDPA path — handles both plain and sink cases.
+      q = mx.transpose(queries, (0, 2, 1, 3))
+      k = mx.transpose(keys, (0, 2, 1, 3))
+      v = mx.transpose(values, (0, 2, 1, 3))
+
+      q = _scale_queries(
+          q, self._per_dim_scale, self._query_scale, self.units_per_head
+      )
+
+      if self.sink_key_embeddings is not None:
+        # JAX computes sink logits with *unscaled* queries.  To use SDPA
+        # we pre-divide sink keys by the scale so that:
+        #   scaled_q @ (sink_k / scale) == unscaled_q @ sink_k
+        scale_vec = _query_scale_vector(
+            self._per_dim_scale,
+            self._query_scale,
+            self.units_per_head,
+            q.dtype,
+        )
+        sink_k = self.sink_key_embeddings.astype(q.dtype) / scale_vec
+        sink_v = self.sink_value_embeddings.astype(v.dtype)
+
+        # GQA: repeat sink heads to match query heads.
+        num_groups = self.num_heads // self.num_kv_heads
+        if num_groups > 1:
+          sink_v = mx.repeat(sink_v, num_groups, axis=1)
+
+        # Transpose [K, nh, h] → [nh, K, h] and broadcast batch.
+        sink_k_b = mx.broadcast_to(
+            mx.transpose(sink_k, (1, 0, 2))[None],
+            (q.shape[0], self.num_heads, sink_k.shape[0], self.units_per_head),
+        )
+        sink_v_b = mx.broadcast_to(
+            mx.transpose(sink_v, (1, 0, 2))[None],
+            (v.shape[0], self.num_heads, sink_v.shape[0], self.units_per_head),
+        )
+
+        # Prepend sinks to K/V.
+        k = mx.concatenate([sink_k_b, k], axis=2)
+        v = mx.concatenate([sink_v_b, v], axis=2)
+
+        # Extend mask — sinks are always valid.
+        if mask is not None:
+          num_sinks = self.sink_key_embeddings.shape[0]
+          sink_mask = mx.ones(
+              (mask.shape[0], mask.shape[1], mask.shape[2], num_sinks),
+              dtype=mx.bool_,
+          )
+          mask = mx.concatenate([sink_mask, mask], axis=-1)
+
+      context = mx.fast.scaled_dot_product_attention(
+          q, k, v, scale=1.0, mask=mask
+      )
+      return mx.transpose(context, (0, 2, 1, 3)), ()
+
+    # Manual path — for attention_logits_soft_cap or when emits are requested.
+    num_groups = self.num_heads // self.num_kv_heads
+    if num_groups > 1:
+      keys = mx.repeat(keys, num_groups, axis=2)
+      values = mx.repeat(values, num_groups, axis=2)
+
+    q = mx.transpose(queries, (0, 2, 1, 3))
+    k = mx.transpose(keys, (0, 2, 1, 3))
+    v = mx.transpose(values, (0, 2, 1, 3))
+
+    # Compute sink logits BEFORE scaling queries, matching JAX behavior.
+    sink_logits = None
+    if self.sink_key_embeddings is not None:
+      sink_k = self.sink_key_embeddings.astype(q.dtype)
+      sink_k_t = mx.transpose(sink_k, (1, 2, 0))
+      sink_logits = mx.matmul(q, sink_k_t)
+
+    q = _scale_queries(
+        q, self._per_dim_scale, self._query_scale, self.units_per_head
+    )
+    logits = mx.matmul(q, mx.transpose(k, (0, 1, 3, 2)))
+
+    if self.sink_key_embeddings is not None:
+      sink_v = self.sink_value_embeddings.astype(v.dtype)
+      if num_groups > 1:
+        sink_v = mx.repeat(sink_v, num_groups, axis=1)
+      sink_v_t = mx.transpose(sink_v, (1, 0, 2))
+      sink_v_b = mx.broadcast_to(sink_v_t[None], (v.shape[0],) + sink_v_t.shape)
+      v = mx.concatenate([sink_v_b, v], axis=2)
+      assert sink_logits is not None
+      logits = mx.concatenate([sink_logits, logits], axis=-1)
+      if mask is not None:
+        num_sinks = self.sink_key_embeddings.shape[0]
+        sink_mask = mx.ones(
+            (mask.shape[0], mask.shape[1], mask.shape[2], num_sinks),
+            dtype=mx.bool_,
+        )
+        mask = mx.concatenate([sink_mask, mask], axis=-1)
+
+    if has_soft_cap:
+      cap = cast(Any, self._attention_logits_soft_cap)
+      logits = mx.tanh(logits / cap) * cap
+
+    if mask is not None:
+      large_neg = mx.array(-1e9, dtype=logits.dtype)
+      logits = mx.where(mask, logits, large_neg)
+
+    logits_f32 = (
+        logits.astype(mx.float32) if logits.dtype != mx.float32 else logits
+    )
+    weights = mx.softmax(logits_f32, axis=-1).astype(v.dtype)
+    context = mx.matmul(weights, v)
+    context = mx.transpose(context, (0, 2, 1, 3))
+
+    emits = ()
+    if emit_attention_weights:
+      # Transpose from [b, nh, q, kv] back to [b, q, nh, kv] to match JAX.
+      emits = mx.transpose(weights, (0, 2, 1, 3))
+
+    return context, emits
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if len(input_shape) != 1:
+      raise ValueError(
+          'DotProductSelfAttention requires rank 3 input,'
+          f' got channel_shape={input_shape}.'
+      )
+    return (self.num_heads, self.units_per_head)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    if self.compute_dtype is not None:
+      return self.compute_dtype
+    return self._param_dtype
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(input_spec.shape[-1])
+    compute_dtype = self.get_output_dtype(input_spec.dtype)
+    max_past = max(0, self.max_past_horizon)
+    max_future = max(0, self.max_future_horizon)
+    kv_buffer_size = max_past + max_future
+
+    kv_shape = (
+        batch_size,
+        kv_buffer_size,
+        self.num_kv_heads,
+        self.units_per_head,
+    )
+    kv_buffer_keys = mx.zeros(kv_shape, dtype=compute_dtype)
+    kv_buffer_values = mx.zeros(kv_shape, dtype=compute_dtype)
+    kv_buffer_mask = mx.zeros((batch_size, kv_buffer_size), dtype=mx.bool_)
+    time_step = mx.zeros((batch_size,), dtype=mx.int32)
+
+    # Q/K/V network states.
+    q_net_state = (
+        self.query_network.get_initial_state(
+            batch_size,
+            types.ShapeDType(
+                (self.num_heads, self.units_per_head),
+                compute_dtype,
+            ),
+            training=training,
+            constants=constants,
+        )
+        if self.query_network is not None
+        else ()
+    )
+    k_net_state = (
+        self.key_network.get_initial_state(
+            batch_size,
+            types.ShapeDType(
+                (self.num_kv_heads, self.units_per_head),
+                compute_dtype,
+            ),
+            training=training,
+            constants=constants,
+        )
+        if self.key_network is not None
+        else ()
+    )
+    v_net_state = (
+        self.value_network.get_initial_state(
+            batch_size,
+            types.ShapeDType(
+                (self.num_kv_heads, self.units_per_head),
+                compute_dtype,
+            ),
+            training=training,
+            constants=constants,
+        )
+        if self.value_network is not None
+        else ()
+    )
+
+    # Query delay buffer for future horizon.
+    if max_future:
+      q_delay_values = mx.zeros(
+          (
+              batch_size,
+              max_future,
+              self.num_heads,
+              self.units_per_head,
+          ),
+          dtype=compute_dtype,
+      )
+      q_delay_mask = mx.zeros((batch_size, max_future), dtype=mx.bool_)
+    else:
+      q_delay_values = ()
+      q_delay_mask = ()
+
+    return (
+        kv_buffer_keys,
+        kv_buffer_values,
+        kv_buffer_mask,
+        time_step,
+        q_net_state,
+        k_net_state,
+        v_net_state,
+        q_delay_values,
+        q_delay_mask,
+    )
+
+  @override
+  def layer_with_emits(self, x, *, training: bool, constants=None):
+    proj_fn = self._project_qkv_fn or self._project_qkv
+    queries, keys, values = proj_fn(x)
+
+    # Optional Q/K/V processing networks (e.g. RoPE).
+    # Use `is not None` because parameterless nn.Modules are falsy.
+    if self.query_network is not None:
+      queries = Sequence(
+          self.query_network.layer(
+              queries, training=training, constants=constants
+          ).values,
+          queries.mask,
+      )
+    if self.key_network is not None:
+      keys = Sequence(
+          self.key_network.layer(
+              keys, training=training, constants=constants
+          ).values,
+          keys.mask,
+      )
+    if self.value_network is not None:
+      values = Sequence(
+          self.value_network.layer(
+              values, training=training, constants=constants
+          ).values,
+          values.mask,
+      )
+
+    # Mask invalid values.
+    values = values.mask_invalid()
+
+    t = x.shape[1]
+
+    # Build visibility mask.
+    # Start with key validity: [b, 1, 1, t].
+    valid_mask = x.mask[:, None, None, :]
+
+    # Optionally add causal / banded mask.
+    if self.max_past_horizon >= 0 or self.max_future_horizon >= 0:
+      past = t - 1 if self.max_past_horizon == -1 else self.max_past_horizon
+      future = (
+          t - 1 if self.max_future_horizon == -1 else self.max_future_horizon
+      )
+      # Banded visibility matrix.
+      row = mx.expand_dims(mx.arange(t), axis=1)
+      col = mx.expand_dims(mx.arange(t), axis=0)
+      banded = (col >= row - past) & (col <= row + future)
+      valid_mask = valid_mask & banded.reshape(1, 1, t, t)
+
+    context, probs = self._compute_attention(
+        queries.values,
+        keys.values,
+        values.values,
+        valid_mask,
+        emit_attention_weights=self.config.emit_attention_weights,
+    )
+    emits = ()
+    if self.config.emit_attention_weights:
+      emits = SelfAttentionEmits(Sequence(probs, x.mask))
+    return Sequence(context, x.mask), emits
+
+  @override
+  def step_with_emits(self, x, state: Any, *, training: bool, constants=None):
+    proj_fn = self._project_qkv_fn or self._project_qkv
+    queries, keys, values = proj_fn(x)
+
+    (
+        kv_buf_k,
+        kv_buf_v,
+        kv_buf_mask,
+        time_step,
+        q_net_state,
+        k_net_state,
+        v_net_state,
+        q_delay_values,
+        q_delay_mask,
+    ) = state
+
+    # Optional Q/K/V processing networks.
+    # Use `is not None` because parameterless nn.Modules are falsy.
+    if self.query_network is not None:
+      queries, q_net_state = self.query_network.step(
+          queries, q_net_state, training=training, constants=constants
+      )
+    if self.key_network is not None:
+      keys, k_net_state = self.key_network.step(
+          keys, k_net_state, training=training, constants=constants
+      )
+    if self.value_network is not None:
+      values, v_net_state = self.value_network.step(
+          values, v_net_state, training=training, constants=constants
+      )
+
+    # Mask invalid values.
+    values = values.mask_invalid()
+
+    x_time = x.shape[1]
+    kv_buffer_size = kv_buf_k.shape[1]
+
+    if self.max_future_horizon > 0:
+      # Concatenate new queries to delay buffer.
+      q_delay_values = mx.concatenate([q_delay_values, queries.values], axis=1)
+      q_delay_mask = mx.concatenate([q_delay_mask, x.mask], axis=1)
+
+      # Use the oldest x_time queries as the current step's queries.
+      queries = Sequence(
+          q_delay_values[:, :x_time],
+          q_delay_mask[:, :x_time],
+      )
+
+      # Preserve the last max_future_horizon queries for the next step.
+      q_delay_values = q_delay_values[:, x_time:]
+      q_delay_mask = q_delay_mask[:, x_time:]
+
+    if kv_buffer_size > 0:
+      t0 = time_step[0]  # MLX scalar, no eval.
+
+      # Concatenate old buffer with new elements for attention computation.
+      # This avoids overwriting history needed by current queries.
+      combined_k = mx.concatenate([kv_buf_k, keys.values], axis=1)
+      combined_v = mx.concatenate([kv_buf_v, values.values], axis=1)
+      combined_mask = mx.concatenate([kv_buf_mask, x.mask], axis=1)
+
+      # Build visibility mask: [b, 1, 1, kv_buffer_size + x_time].
+      kv_valid = combined_mask[:, None, None, :]
+
+      # Map physical indices in old buffer to temporal indices.
+      # The newest time in the old buffer was t0 - 1.
+      newest_time_old = t0 - 1
+      newest_pos_old = newest_time_old % kv_buffer_size
+      phys_old = mx.arange(kv_buffer_size)
+      dist_old = (newest_pos_old - phys_old + kv_buffer_size) % kv_buffer_size
+      temporal_old = newest_time_old - dist_old
+
+      # Temporal indices for new elements.
+      temporal_new = t0 + mx.arange(x_time)
+
+      # Combine temporal indices.
+      temporal = mx.concatenate([temporal_old, temporal_new], axis=0)
+
+      # Add causal mask for multi-step queries (respects ring buffer order).
+      q_times = t0 - self.max_future_horizon + mx.arange(x_time)
+      causal = temporal[None, :] <= (q_times[:, None] + self.max_future_horizon)
+
+      # Add finite horizon mask.
+      past = self.max_past_horizon
+      finite_horizon = temporal[None, :] >= (q_times[:, None] - past)
+
+      causal_and_finite = causal & finite_horizon
+      kv_valid = kv_valid & causal_and_finite.reshape(
+          1, 1, x_time, kv_buffer_size + x_time
+      )
+
+      context, probs = self._compute_attention(
+          queries.values,
+          combined_k,
+          combined_v,
+          kv_valid,
+          emit_attention_weights=self.config.emit_attention_weights,
+      )
+
+      if self.config.emit_attention_weights:
+        assert isinstance(probs, mx.array)
+        sort_idx = mx.argsort(temporal)
+        probs = probs[..., sort_idx]
+
+      # Ring buffer write AFTER read: insert new K/V at rotating positions.
+      positions = (t0 + mx.arange(x_time)) % kv_buffer_size  # [x_time]
+
+      # Scatter K/V into buffer at ring positions.
+      idx_4d = mx.broadcast_to(
+          positions.reshape(1, x_time, 1, 1), keys.values.shape
+      )
+      kv_buf_k = mx.put_along_axis(kv_buf_k, idx_4d, keys.values, axis=1)
+      kv_buf_v = mx.put_along_axis(kv_buf_v, idx_4d, values.values, axis=1)
+
+      # Scatter mask into buffer.
+      idx_2d = mx.broadcast_to(positions.reshape(1, x_time), x.mask.shape)
+      kv_buf_mask = mx.put_along_axis(kv_buf_mask, idx_2d, x.mask, axis=1)
+    else:
+      # Degenerate: no history buffer, attend only to current step.
+      kv_valid = x.mask[:, None, None, :]
+      if x_time > 1:
+        causal = _causal_mask(x_time, x_time)
+        kv_valid = kv_valid & causal
+      context, probs = self._compute_attention(
+          queries.values,
+          keys.values,
+          values.values,
+          kv_valid,
+          emit_attention_weights=self.config.emit_attention_weights,
+      )
+
+    new_state = (
+        kv_buf_k,
+        kv_buf_v,
+        kv_buf_mask,
+        time_step + x_time,
+        q_net_state,
+        k_net_state,
+        v_net_state,
+        q_delay_values,
+        q_delay_mask,
+    )
+    emits = ()
+    if self.config.emit_attention_weights:
+      emits = SelfAttentionEmits(Sequence(probs, queries.mask))
+    return Sequence(context, queries.mask), new_state, emits
+
+  def to_quantized(
+      self, group_size: int = 64, bits: int = 4, mode: str = 'affine'
+  ):
+    """Convert attention projection layers to quantized versions."""
+    del mode  # Unused in MLX quantize
+    if (
+        getattr(self, 'q_proj', None) is None
+        or self.q_proj.shape[0] % group_size != 0
+    ):
+      return self
+
+    self._quant_group_size = group_size
+    self._quant_bits = bits
+
+    w_q = self.q_proj.T
+    # kv_proj is already combined [in, 2*kv_dim].
+    w_kv = self.kv_proj.T
+    w_qkv = mx.concatenate([w_q, w_kv], axis=0)
+    self.qkv_proj_qw, self.qkv_proj_qs, self.qkv_proj_qb = mx.quantize(
+        w_qkv, group_size=group_size, bits=bits
+    )
+
+    self.q_proj = cast(Any, None)
+    self.kv_proj = cast(Any, None)
+
+    def _project_qkv(self, x):
+      b, t = x.shape[0], x.shape[1]
+      dtype = self.compute_dtype or x.dtype
+      v = x.values.astype(dtype)
+
+      qkv = _quantized_matmul_proj(
+          v,
+          self.qkv_proj_qw,
+          self.qkv_proj_qs,
+          self.qkv_proj_qb,
+          self._quant_group_size,
+          self._quant_bits,
+      )
+
+      d_q = self.num_heads * self.units_per_head
+      d_k = self.num_kv_heads * self.units_per_head
+      q, k, val = mx.split(qkv, [d_q, d_q + d_k], axis=-1)
+
+      if self.use_bias:
+        q = q + self.q_bias.astype(dtype)
+        kv_bias = self.kv_bias.astype(dtype)
+        kb, vb = mx.split(kv_bias, 2, axis=-1)
+        k = k + kb
+        val = val + vb
+
+      q = q.reshape(b, t, self.num_heads, self.units_per_head)
+      k = k.reshape(b, t, self.num_kv_heads, self.units_per_head)
+      val = val.reshape(b, t, self.num_kv_heads, self.units_per_head)
+
+      return (
+          Sequence(q, x.mask),
+          Sequence(k, x.mask),
+          Sequence(val, x.mask),
+      )
+
+    self._project_qkv_fn = MethodType(_project_qkv, self)
+    return self
+
+  @classmethod
+  def from_config(cls, config: Any) -> 'DotProductSelfAttention':
+    """Create from a Linen DotProductSelfAttention.Config."""
+    mlx_config = cls.Config(
+        num_heads=config.num_heads,
+        units_per_head=config.units_per_head,
+        max_past_horizon=config.max_past_horizon,
+        max_future_horizon=config.max_future_horizon,
+        num_kv_heads=config.num_kv_heads,
+        attention_probabilities_dropout_rate=config.attention_probabilities_dropout_rate,
+        broadcast_dropout_across_queries=config.broadcast_dropout_across_queries,
+        use_bias=config.use_bias,
+        input_projection=_map_projection_config(config.input_projection),
+        query_network=config.query_network,
+        key_network=config.key_network,
+        value_network=config.value_network,
+        attention_logits_soft_cap=config.attention_logits_soft_cap,
+        per_dim_scale=config.per_dim_scale,
+        query_scale=config.query_scale,
+        zero_fully_masked=config.zero_fully_masked,
+        compute_dtype=config.compute_dtype,
+        param_dtype=config.param_dtype or mx.float32,
+        num_sink_embeddings=config.num_sink_embeddings,
+        use_sink_scalars=config.use_sink_scalars,
+        use_kv_cache_ringbuffer=config.use_kv_cache_ringbuffer,
+        name=config.name,
+    )
+    return cls(mlx_config)
+
+
+def _map_projection_config(
+    config: attention_spec.QueryKeyValueProjectionConfig,
+) -> projection_configs.QueryKeyValueProjectionConfig:
+  """Maps a spec-level projection config (which may be JAX) to MLX."""
+  if isinstance(config, attention_spec.CombinedQueryKeyValueProjection):
+    return projection_configs.CombinedQueryKeyValueProjection(
+        share_kv_projection=config.share_kv_projection,
+        qkv_kernel_init=getattr(config, 'qkv_kernel_init', None),
+        bias_init=getattr(config, 'bias_init', None),
+    )
+  if isinstance(config, attention_spec.SeparateQueryKeyValueProjection):
+    return projection_configs.SeparateQueryKeyValueProjection(
+        q_kernel_init=getattr(config, 'q_kernel_init', None),
+        k_kernel_init=getattr(config, 'k_kernel_init', None),
+        v_kernel_init=getattr(config, 'v_kernel_init', None),
+        bias_init=getattr(config, 'bias_init', None),
+    )
+  if isinstance(config, attention_spec.QueryAndKeyValueProjection):
+    return projection_configs.QueryAndKeyValueProjection(
+        q_kernel_init=getattr(config, 'q_kernel_init', None),
+        q_bias_init=getattr(config, 'q_bias_init', None),
+        kv_kernel_init=getattr(config, 'kv_kernel_init', None),
+        kv_bias_init=getattr(config, 'kv_bias_init', None),
+    )
+  if isinstance(config, attention_spec.QueryAndSharedKeyValueProjection):
+    return projection_configs.QueryAndSharedKeyValueProjection(
+        q_kernel_init=getattr(config, 'q_kernel_init', None),
+        q_bias_init=getattr(config, 'q_bias_init', None),
+        kv_kernel_init=getattr(config, 'kv_kernel_init', None),
+        kv_bias_init=getattr(config, 'kv_bias_init', None),
+    )
+  return cast(Any, config)
+
+
+class DotProductAttention(
+    types.Emitting,
+    attention_spec.DotProductAttention[types.Sequence, types.ChannelSpec],
+):
+  """Multi-headed dot-product cross attention for MLX."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(
+      types.SequenceLayerConfig,
+      attention_spec.DotProductAttention.Config,
+  ):
+    """MLX-native configuration for DotProductAttention."""
+
+    source_name: str
+    num_heads: int
+    units_per_head: int
+    attention_probabilities_dropout_rate: float = 0.0
+    broadcast_dropout_across_queries: bool = False
+    use_bias: bool = False
+    input_projection: projection_configs.QueryKeyValueProjectionConfig = (
+        dataclasses.field(
+            default_factory=projection_configs.QueryAndKeyValueProjection
+        )
+    )
+    query_network: Any = None
+    key_network: Any = None
+    value_network: Any = None
+    attention_logits_soft_cap: float | None = None
+    per_dim_scale: bool = False
+    query_scale: float | None = None
+    zero_fully_masked: bool = False
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'DotProductAttention':
+      return DotProductAttention(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      in_features: int | None = None,
+      source_features: int | None = None,
+      source_name: str | None = None,
+      num_heads: int | None = None,
+      units_per_head: int | None = None,
+      use_bias: bool = False,
+      query_scale: float | None = None,
+      per_dim_scale: bool = False,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+      kernel_init=None,
+      bias_init=None,
+      query_network: types.SequenceLayer | None = None,
+      key_network: types.SequenceLayer | None = None,
+      value_network: types.SequenceLayer | None = None,
+  ):
+    super().__init__()
+    if config is None:
+      if source_name is None or num_heads is None or units_per_head is None:
+        raise ValueError(
+            'Must provide either config or source_name, num_heads, and'
+            ' units_per_head'
+        )
+      source_name = cast(str, source_name)
+      num_heads = cast(int, num_heads)
+      units_per_head = cast(int, units_per_head)
+      # Reconstruct config to store unified properties
+      config = DotProductAttention.Config(
+          source_name=source_name,
+          num_heads=num_heads,
+          units_per_head=units_per_head,
+          use_bias=use_bias,
+          query_scale=query_scale,
+          per_dim_scale=per_dim_scale,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+          query_network=query_network,
+          key_network=key_network,
+          value_network=value_network,
+      )
+    self.config = config
+
+    self.compute_dtype = (
+        init_mapping.to_mx_dtype(self.config.compute_dtype)
+        if self.config.compute_dtype is not None
+        else None
+    )
+    self._param_dtype = (
+        init_mapping.to_mx_dtype(self.config.param_dtype) or mx.float32
+    )
+
+    self.in_features = None
+    self.source_features = None
+    self.source_name = self.config.source_name
+    self.num_heads = self.config.num_heads
+    self.units_per_head = self.config.units_per_head
+    self.use_bias = self.config.use_bias
+    self._query_scale = self.config.query_scale
+
+    self._kernel_init = kernel_init
+    self._bias_init = bias_init
+    self._per_dim_scale = None
+
+    self.query_network: Any = query_network or self.config.query_network
+    self.key_network: Any = key_network or self.config.key_network
+    self.value_network: Any = value_network or self.config.value_network
+
+    self._initialized = False
+
+    if in_features is not None and source_features is not None:
+      self._ensure_initialized(in_features, source_features)
+
+  def _ensure_initialized(self, in_features: int, source_features: int):
+    """Ensure parameters and submodules are dynamically initialized."""
+    if self._initialized:
+      return
+    self._initialized = True
+    self.in_features = in_features
+    self.source_features = source_features
+
+    # pylint: disable=import-outside-toplevel
+    from sequence_layers.mlx import utils as mlx_utils
+
+    if hasattr(self.query_network, 'make'):
+      self.query_network = mlx_utils.make_layer(self.query_network)
+    if hasattr(self.key_network, 'make'):
+      self.key_network = mlx_utils.make_layer(self.key_network)
+    if hasattr(self.value_network, 'make'):
+      self.value_network = mlx_utils.make_layer(self.value_network)
+
+    param_dtype = self._param_dtype
+    per_dim_scale = self.config.per_dim_scale
+    units_per_head = self.units_per_head
+    num_heads = self.num_heads
+    use_bias = self.use_bias
+    input_projection = self.config.input_projection
+
+    self._per_dim_scale = (
+        mx.zeros((units_per_head,), dtype=param_dtype)
+        if per_dim_scale
+        else None
+    )
+
+    kernel_init = self._kernel_init
+    bias_init = self._bias_init
+
+    if kernel_init is None:
+      qkv_init = (
+          getattr(input_projection, 'qkv_kernel_init', None)
+          or getattr(input_projection, 'q_kernel_init', None)
+          or getattr(input_projection, 'kv_kernel_init', None)
+      )
+      if qkv_init is not None:
+        kernel_init = init_mapping.map_initializer(qkv_init)
+      else:
+        kernel_init = init_mapping.make_variance_scaling_init(
+            'fan_in', 'truncated_normal'
+        )
+
+    if bias_init is None:
+      qkv_bias_init = (
+          getattr(input_projection, 'bias_init', None)
+          or getattr(input_projection, 'q_bias_init', None)
+          or getattr(input_projection, 'kv_bias_init', None)
+      )
+      if qkv_bias_init is not None:
+        bias_init = init_mapping.map_initializer(qkv_bias_init)
+      else:
+        bias_init = init_mapping.zeros_init
+
+    key = mx.random.key(0)
+    qkv_dim = num_heads * units_per_head
+
+    self.q_proj = kernel_init(key, (in_features, qkv_dim), param_dtype)
+    self.kv_proj = mx.concatenate(
+        [
+            kernel_init(key, (source_features, qkv_dim), param_dtype),
+            kernel_init(key, (source_features, qkv_dim), param_dtype),
+        ],
+        axis=-1,
+    )
+    if use_bias:
+      self.q_bias = bias_init(key, (qkv_dim,), param_dtype)
+      self.kv_bias = mx.concatenate(
+          [
+              bias_init(key, (qkv_dim,), param_dtype),
+              bias_init(key, (qkv_dim,), param_dtype),
+          ],
+          axis=-1,
+      )
+
+  @property
+  @override
+  def supports_step(self):
+    if self.query_network is not None:
+      return self.query_network.supports_step
+    return True
+
+  @property
+  @override
+  def input_latency(self):
+    return 0
+
+  def _project_q(self, x):
+    """Project input query sequence."""
+    b, t = x.shape[0], x.shape[1]
+    dtype = self.compute_dtype or x.dtype
+    v = x.values.astype(dtype)
+    q = mx.matmul(v, self.q_proj.astype(dtype))
+    if self.use_bias:
+      q = q + self.q_bias.astype(dtype)
+    q = q.reshape(b, t, self.num_heads, self.units_per_head)
+    return Sequence(q, x.mask)
+
+  def _project_kv(self, source):
+    """Project external source sequence to key/value matrices."""
+    b, t = source.shape[0], source.shape[1]
+    dtype = self.compute_dtype or source.dtype
+    v = source.values.astype(dtype)
+    kv = mx.matmul(v, self.kv_proj.astype(dtype))
+    k, val = mx.split(kv, 2, axis=-1)
+    if self.use_bias:
+      kv_bias = self.kv_bias.astype(dtype)
+      kb, vb = mx.split(kv_bias, 2, axis=-1)
+      k = k + kb
+      val = val + vb
+    k = k.reshape(b, t, self.num_heads, self.units_per_head)
+    val = val.reshape(b, t, self.num_heads, self.units_per_head)
+    return Sequence(k, source.mask), Sequence(val, source.mask)
+
+  def _get_source(self, constants):
+    """Helper to resolve the external source sequence from constants."""
+    if constants is None or self.source_name not in constants:
+      raise ValueError(f'Source "{self.source_name}" not found in constants.')
+    return constants[self.source_name]
+
+  def _compute_attention(
+      self, queries, keys, values, mask, emit_attention_weights=False
+  ):
+    """Compute scaled dot-product attention (no causal mask)."""
+    q = mx.transpose(queries, (0, 2, 1, 3))
+    k = mx.transpose(keys, (0, 2, 1, 3))
+    v = mx.transpose(values, (0, 2, 1, 3))
+
+    q = _scale_queries(
+        q, self._per_dim_scale, self._query_scale, self.units_per_head
+    )
+
+    if not emit_attention_weights:
+      context = mx.fast.scaled_dot_product_attention(
+          q, k, v, scale=1.0, mask=mask
+      )
+      return mx.transpose(context, (0, 2, 1, 3)), ()
+
+    # Manual path for emits
+    logits = mx.matmul(q, mx.transpose(k, (0, 1, 3, 2)))
+
+    if mask is not None:
+      large_neg = mx.array(-1e9, dtype=logits.dtype)
+      logits = mx.where(mask, logits, large_neg)
+
+    logits_f32 = (
+        logits.astype(mx.float32) if logits.dtype != mx.float32 else logits
+    )
+    weights = mx.softmax(logits_f32, axis=-1).astype(v.dtype)
+    context = mx.matmul(weights, v)
+    context = mx.transpose(context, (0, 2, 1, 3))
+
+    emits = mx.transpose(weights, (0, 2, 1, 3))
+    return context, emits
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    """Returns the output shape of the layer's features."""
+    if len(input_shape) != 1:
+      raise ValueError(
+          'DotProductAttention requires rank 3 input,'
+          f' got channel_shape={input_shape}.'
+      )
+    return (self.num_heads, self.units_per_head)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    """Returns the computation output dtype."""
+    if self.compute_dtype is not None:
+      return self.compute_dtype
+    return self._param_dtype
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    """Computes and returns the initial cache states for cross attention."""
+    source = self._get_source(constants)
+    self._ensure_initialized(input_spec.shape[-1], source.shape[-1])
+
+    keys, values = self._project_kv(source)
+
+    if self.key_network is not None:
+      keys = self.key_network.layer(
+          keys, training=training, constants=constants
+      )
+    if self.value_network is not None:
+      values = self.value_network.layer(
+          values, training=training, constants=constants
+      )
+
+    keys = keys.mask_invalid()
+    values = values.mask_invalid()
+
+    q_net_state = (
+        self.query_network.get_initial_state(
+            batch_size,
+            types.ShapeDType(
+                (self.num_heads, self.units_per_head),
+                self.get_output_dtype(input_spec.dtype),
+            ),
+            training=training,
+            constants=constants,
+        )
+        if self.query_network is not None
+        else ()
+    )
+
+    time_step = mx.zeros((batch_size,), dtype=mx.int32)
+    return (
+        keys.values,
+        values.values,
+        keys.mask,
+        q_net_state,
+        time_step,
+    )
+
+  @override
+  def layer_with_emits(self, x, *, training: bool, constants=None):
+    source = self._get_source(constants)
+    self._ensure_initialized(x.shape[-1], source.shape[-1])
+
+    keys, values = self._project_kv(source)
+
+    if self.key_network is not None:
+      keys = self.key_network.layer(
+          keys, training=training, constants=constants
+      )
+    if self.value_network is not None:
+      values = self.value_network.layer(
+          values, training=training, constants=constants
+      )
+
+    queries = self._project_q(x)
+    if self.query_network is not None:
+      queries = Sequence(
+          self.query_network.layer(
+              queries, training=training, constants=constants
+          ).values,
+          queries.mask,
+      )
+
+    values = values.mask_invalid()
+    valid_mask = source.mask[:, None, None, :]
+    context, probs = self._compute_attention(
+        queries.values,
+        keys.values,
+        values.values,
+        valid_mask,
+        emit_attention_weights=self.config.emit_attention_weights,
+    )
+    emits = ()
+    if self.config.emit_attention_weights:
+      emits = CrossAttentionEmits({self.source_name: Sequence(probs, x.mask)})
+    return Sequence(context, x.mask), emits
+
+  @override
+  def step_with_emits(self, x, state: Any, *, training: bool, constants=None):
+    keys_v, values_v, kv_mask, q_net_state, time_step = state
+    source = self._get_source(constants)
+    self._ensure_initialized(x.shape[-1], source.shape[-1])
+
+    queries = self._project_q(x)
+    if self.query_network is not None:
+      queries, q_net_state = self.query_network.step(
+          queries, q_net_state, training=training, constants=constants
+      )
+
+    valid_mask = kv_mask[:, None, None, :]
+    context, probs = self._compute_attention(
+        queries.values,
+        keys_v,
+        values_v,
+        valid_mask,
+        emit_attention_weights=self.config.emit_attention_weights,
+    )
+
+    new_state = (
+        keys_v,
+        values_v,
+        kv_mask,
+        q_net_state,
+        time_step + x.shape[1],
+    )
+    emits = ()
+    if self.config.emit_attention_weights:
+      emits = CrossAttentionEmits({self.source_name: Sequence(probs, x.mask)})
+    return Sequence(context, x.mask), new_state, emits
+
+  @classmethod
+  def from_config(cls, config: Any) -> 'DotProductAttention':
+    """Create from a Linen DotProductAttention.Config."""
+    mlx_config = cls.Config(
+        source_name=config.source_name,
+        num_heads=config.num_heads,
+        units_per_head=config.units_per_head,
+        attention_probabilities_dropout_rate=config.attention_probabilities_dropout_rate,
+        broadcast_dropout_across_queries=config.broadcast_dropout_across_queries,
+        use_bias=config.use_bias,
+        input_projection=_map_projection_config(config.input_projection),
+        query_network=config.query_network,
+        key_network=config.key_network,
+        value_network=config.value_network,
+        attention_logits_soft_cap=config.attention_logits_soft_cap,
+        per_dim_scale=config.per_dim_scale,
+        query_scale=config.query_scale,
+        zero_fully_masked=config.zero_fully_masked,
+        compute_dtype=config.compute_dtype,
+        param_dtype=config.param_dtype or mx.float32,
+        name=config.name,
+    )
+    return cls(mlx_config)
+
+
+def _banded_mask(q_len, kv_len, num_lower, num_upper):
+  """Build a [1, 1, q_len, kv_len] banded visibility mask.
+
+  Position (i, j) is True iff j >= i - num_lower and j <= i + num_upper.
+  """
+  row = mx.expand_dims(mx.arange(q_len), axis=1)
+  col = mx.expand_dims(mx.arange(kv_len), axis=0)
+  mask = (col >= row - num_lower) & (col <= row + num_upper)
+  return mask.reshape(1, 1, q_len, kv_len)
+
+
+def _step_visibility_mask(
+    max_past_horizon, max_future_horizon, query_time, key_time
+):
+  """Compute step-wise banded visibility mask.
+
+  For a single query (query_time=1), returns None since no causal mask
+  is needed — the KV buffer already contains only visible positions.
+
+  For multi-step queries, returns a banded matrix with num_lower=0 and
+  num_upper=max_past_horizon + max_future_horizon.
+  """
+  if query_time == 1:
+    return None
+  return _banded_mask(
+      query_time,
+      key_time,
+      num_lower=0,
+      num_upper=max_past_horizon + max_future_horizon,
+  )
+
+
+class StreamingDotProductAttention(
+    types.Emitting,
+    attention_spec.StreamingDotProductAttention[
+        types.Sequence, types.ChannelSpec
+    ],
+):
+  """Multi-headed streaming cross-attention for MLX.
+
+  Also covers StreamingLocalDotProductAttention from the JAX backend.
+
+  Queries come from the input; keys and values come from a source
+  sequence provided in constants at the same streaming rate as input.
+
+  Unlike DotProductAttention (which pre-projects the full source in
+  get_initial_state), this class projects source chunks per-step and
+  maintains a rolling KV buffer, enabling streaming cross-attention.
+
+  Covers both StreamingDotProductAttention and
+  StreamingLocalDotProductAttention from the JAX backend (which differ
+  only in layer-mode efficiency, not in step-mode behavior or output).
+
+  Kernels stored in Linen-compatible shapes:
+    q_proj: [in_features, num_heads * units_per_head]
+    k_proj: [source_features, num_heads * units_per_head]
+    v_proj: [source_features, num_heads * units_per_head]
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(
+      types.SequenceLayerConfig,
+      attention_spec.StreamingDotProductAttention.Config,
+  ):
+    """MLX-native configuration for StreamingDotProductAttention.
+
+    This Config also serves as the MLX-native equivalent of the JAX
+    StreamingLocalDotProductAttention.Config.
+    """
+
+    source_name: str
+    num_heads: int
+    units_per_head: int
+    block_size: int = 1
+    max_past_horizon: int = 1
+    max_future_horizon: int = 0
+    attention_probabilities_dropout_rate: float = 0.0
+    broadcast_dropout_across_queries: bool = False
+    use_bias: bool = False
+    use_query_delay_buffer: bool = True
+    input_projection: projection_configs.QueryKeyValueProjectionConfig = (
+        dataclasses.field(
+            default_factory=projection_configs.QueryAndKeyValueProjection
+        )
+    )
+    query_network: Any = None
+    key_network: Any = None
+    value_network: Any = None
+    attention_logits_soft_cap: float | None = None
+    per_dim_scale: bool = False
+    query_scale: float | None = None
+    zero_fully_masked: bool = False
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    num_sink_embeddings: int = 0
+    use_sink_scalars: bool = False
+    use_kv_cache_ringbuffer: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> 'StreamingDotProductAttention':
+      return StreamingDotProductAttention(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      in_features: int | None = None,
+      source_features: int | None = None,
+      source_name: str | None = None,
+      num_heads: int | None = None,
+      units_per_head: int | None = None,
+      max_past_horizon: int | None = None,
+      max_future_horizon: int = 0,
+      use_bias: bool = False,
+      use_query_delay_buffer: bool = True,
+      query_scale: float | None = None,
+      per_dim_scale: bool = False,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+      kernel_init=None,
+      bias_init=None,
+      query_network: types.SequenceLayer | None = None,
+      key_network: types.SequenceLayer | None = None,
+      value_network: types.SequenceLayer | None = None,
+      num_sink_embeddings: int = 0,
+      input_projection=None,
+  ):
+    super().__init__()
+    if config is None:
+      if (
+          source_name is None
+          or num_heads is None
+          or units_per_head is None
+          or max_past_horizon is None
+      ):
+        raise ValueError(
+            'Must provide either config or source_name, num_heads, '
+            'units_per_head, and max_past_horizon'
+        )
+      source_name = cast(str, source_name)
+      num_heads = cast(int, num_heads)
+      units_per_head = cast(int, units_per_head)
+      max_past_horizon = cast(int, max_past_horizon)
+      input_projection_val = (
+          input_projection or projection_configs.QueryAndKeyValueProjection()
+      )
+      config = StreamingDotProductAttention.Config(
+          source_name=source_name,
+          num_heads=num_heads,
+          units_per_head=units_per_head,
+          max_past_horizon=max_past_horizon,
+          max_future_horizon=max_future_horizon,
+          use_bias=use_bias,
+          use_query_delay_buffer=use_query_delay_buffer,
+          query_scale=query_scale,
+          per_dim_scale=per_dim_scale,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+          query_network=query_network,
+          key_network=key_network,
+          value_network=value_network,
+          num_sink_embeddings=num_sink_embeddings,
+          input_projection=input_projection_val,
+      )
+    self.config = config
+
+    if self.config.max_past_horizon < 1:
+      raise ValueError(
+          f'max_past_horizon must be >= 1, got {self.config.max_past_horizon}.'
+      )
+    if self.config.max_future_horizon < 0:
+      raise ValueError(
+          'max_future_horizon must be >= 0, got'
+          f' {self.config.max_future_horizon}.'
+      )
+
+    self.compute_dtype = (
+        init_mapping.to_mx_dtype(self.config.compute_dtype)
+        if self.config.compute_dtype is not None
+        else None
+    )
+    self._param_dtype = (
+        init_mapping.to_mx_dtype(self.config.param_dtype) or mx.float32
+    )
+
+    self.in_features = None
+    self.source_features = None
+    self.source_name = self.config.source_name
+    self.num_heads = self.config.num_heads
+    self.units_per_head = self.config.units_per_head
+    self.max_past_horizon = self.config.max_past_horizon
+    self.max_future_horizon = self.config.max_future_horizon
+    self.use_bias = self.config.use_bias
+    self.use_query_delay_buffer = self.config.use_query_delay_buffer
+    self._query_scale = self.config.query_scale
+
+    self._kernel_init = kernel_init
+    self._bias_init = bias_init
+    self._per_dim_scale = None
+
+    self.query_network: Any = query_network or self.config.query_network
+    self.key_network: Any = key_network or self.config.key_network
+    self.value_network: Any = value_network or self.config.value_network
+
+    self.num_sink_embeddings = self.config.num_sink_embeddings
+    self.sink_key_embeddings: Any = None
+    self.sink_value_embeddings: Any = None
+
+    self.q_proj: Any = None
+    self.kv_proj: Any = None
+    self.q_proj_qw: Any = None
+    self.q_proj_qs: Any = None
+    self.q_proj_qb: Any = None
+    self.kv_proj_qw: Any = None
+    self.kv_proj_qs: Any = None
+    self.kv_proj_qb: Any = None
+    self._quant_group_size: int | None = None
+    self._quant_bits: int | None = None
+    self._project_q_fn: Any = None
+    self._project_kv_fn: Any = None
+
+    self._initialized = False
+
+    if in_features is not None and source_features is not None:
+      self._ensure_initialized(in_features, source_features)
+
+  def _ensure_initialized(self, in_features: int, source_features: int):
+    """Ensure parameters and submodules are dynamically initialized."""
+    if self._initialized:
+      return
+    self._initialized = True
+    self.in_features = in_features
+    self.source_features = source_features
+
+    # pylint: disable=import-outside-toplevel
+    from sequence_layers.mlx import utils as mlx_utils
+
+    if hasattr(self.query_network, 'make'):
+      self.query_network = mlx_utils.make_layer(self.query_network)
+    if hasattr(self.key_network, 'make'):
+      self.key_network = mlx_utils.make_layer(self.key_network)
+    if hasattr(self.value_network, 'make'):
+      self.value_network = mlx_utils.make_layer(self.value_network)
+
+    param_dtype = self._param_dtype
+    per_dim_scale = self.config.per_dim_scale
+    units_per_head = self.units_per_head
+    num_heads = self.num_heads
+    use_bias = self.use_bias
+    input_projection = self.config.input_projection
+    num_sink_embeddings = self.num_sink_embeddings
+
+    self._per_dim_scale = (
+        mx.zeros((units_per_head,), dtype=param_dtype)
+        if per_dim_scale
+        else None
+    )
+
+    kernel_init = self._kernel_init
+    bias_init = self._bias_init
+
+    if kernel_init is None:
+      qkv_init = (
+          getattr(input_projection, 'qkv_kernel_init', None)
+          or getattr(input_projection, 'q_kernel_init', None)
+          or getattr(input_projection, 'kv_kernel_init', None)
+      )
+      if qkv_init is not None:
+        kernel_init = init_mapping.map_initializer(qkv_init)
+      else:
+        kernel_init = init_mapping.make_variance_scaling_init(
+            'fan_in', 'truncated_normal'
+        )
+
+    if bias_init is None:
+      qkv_bias_init = (
+          getattr(input_projection, 'bias_init', None)
+          or getattr(input_projection, 'q_bias_init', None)
+          or getattr(input_projection, 'kv_bias_init', None)
+      )
+      if qkv_bias_init is not None:
+        bias_init = init_mapping.map_initializer(qkv_bias_init)
+      else:
+        bias_init = init_mapping.zeros_init
+
+    key = mx.random.key(0)
+    qkv_dim = num_heads * units_per_head
+
+    self.q_proj = kernel_init(key, (in_features, qkv_dim), param_dtype)
+    self.kv_proj = mx.concatenate(
+        [
+            kernel_init(key, (source_features, qkv_dim), param_dtype),
+            kernel_init(key, (source_features, qkv_dim), param_dtype),
+        ],
+        axis=-1,
+    )
+    if use_bias:
+      self.q_bias = bias_init(key, (qkv_dim,), param_dtype)
+      self.kv_bias = mx.concatenate(
+          [
+              bias_init(key, (qkv_dim,), param_dtype),
+              bias_init(key, (qkv_dim,), param_dtype),
+          ],
+          axis=-1,
+      )
+
+    if num_sink_embeddings > 0:
+      self.sink_key_embeddings = mx.zeros(
+          (num_sink_embeddings, num_heads, units_per_head), dtype=param_dtype
+      )
+      self.sink_value_embeddings = mx.zeros(
+          (num_sink_embeddings, num_heads, units_per_head), dtype=param_dtype
+      )
+
+  @property
+  @override
+  def supports_step(self):
+    supports = True
+    if self.query_network is not None:
+      supports = supports and self.query_network.supports_step
+    if self.key_network is not None:
+      supports = supports and self.key_network.supports_step
+    if self.value_network is not None:
+      supports = supports and self.value_network.supports_step
+    return supports
+
+  @property
+  @override
+  def input_latency(self):
+    if self.max_future_horizon > 0 and self.use_query_delay_buffer:
+      return self.max_future_horizon
+    return 0
+
+  def _project_q(self, x):
+    """Project input to query sequence."""
+    b, t = x.shape[0], x.shape[1]
+    dtype = self.compute_dtype or x.dtype
+    v = x.values.astype(dtype)
+    q = mx.matmul(v, self.q_proj.astype(dtype))
+    if self.use_bias:
+      q = q + self.q_bias.astype(dtype)
+    q = q.reshape(b, t, self.num_heads, self.units_per_head)
+    return Sequence(q, x.mask)
+
+  def _project_kv(self, source):
+    """Project source to key/value sequences."""
+    b, t = source.shape[0], source.shape[1]
+    dtype = self.compute_dtype or source.dtype
+    v = source.values.astype(dtype)
+    kv = mx.matmul(v, self.kv_proj.astype(dtype))
+    k, val = mx.split(kv, 2, axis=-1)
+    if self.use_bias:
+      kv_bias = self.kv_bias.astype(dtype)
+      kb, vb = mx.split(kv_bias, 2, axis=-1)
+      k = k + kb
+      val = val + vb
+    k = k.reshape(b, t, self.num_heads, self.units_per_head)
+    val = val.reshape(b, t, self.num_heads, self.units_per_head)
+    return Sequence(k, source.mask), Sequence(val, source.mask)
+
+  def _get_source(self, constants):
+    """Helper to resolve the external source sequence from constants."""
+    if constants is None or self.source_name not in constants:
+      raise ValueError(f'Source "{self.source_name}" not found in constants.')
+    return constants[self.source_name]
+
+  def _compute_attention(
+      self, queries, keys, values, mask, emit_attention_weights=False
+  ):
+    """Compute scaled dot-product attention."""
+    q = mx.transpose(queries, (0, 2, 1, 3))
+    k = mx.transpose(keys, (0, 2, 1, 3))
+    v = mx.transpose(values, (0, 2, 1, 3))
+
+    q = _scale_queries(
+        q, self._per_dim_scale, self._query_scale, self.units_per_head
+    )
+
+    if not emit_attention_weights:
+      if self.sink_key_embeddings is not None:
+        # JAX computes sink logits with *unscaled* queries.  Pre-divide
+        # sink keys by the scale so that SDPA produces equivalent logits:
+        #   scaled_q @ (sink_k / scale) == unscaled_q @ sink_k
+        scale_vec = _query_scale_vector(
+            self._per_dim_scale,
+            self._query_scale,
+            self.units_per_head,
+            q.dtype,
+        )
+        sink_k = self.sink_key_embeddings.astype(q.dtype) / scale_vec
+        sink_v = self.sink_value_embeddings.astype(v.dtype)
+
+        sink_k_b = mx.broadcast_to(
+            mx.transpose(sink_k, (1, 0, 2))[None],
+            (q.shape[0], self.num_heads, sink_k.shape[0], self.units_per_head),
+        )
+        sink_v_b = mx.broadcast_to(
+            mx.transpose(sink_v, (1, 0, 2))[None],
+            (v.shape[0], self.num_heads, sink_v.shape[0], self.units_per_head),
+        )
+
+        k = mx.concatenate([sink_k_b, k], axis=2)
+        v = mx.concatenate([sink_v_b, v], axis=2)
+
+        if mask is not None:
+          num_sinks = self.sink_key_embeddings.shape[0]
+          sink_mask = mx.ones(
+              (mask.shape[0], mask.shape[1], mask.shape[2], num_sinks),
+              dtype=mx.bool_,
+          )
+          mask = mx.concatenate([sink_mask, mask], axis=-1)
+
+      context = mx.fast.scaled_dot_product_attention(
+          q, k, v, scale=1.0, mask=mask
+      )
+      return mx.transpose(context, (0, 2, 1, 3)), ()
+
+    # Manual path for emits
+    sink_logits = None
+    if self.sink_key_embeddings is not None:
+      sink_k = self.sink_key_embeddings.astype(q.dtype)
+      sink_k_t = mx.transpose(sink_k, (1, 2, 0))
+      sink_logits = mx.matmul(q, sink_k_t)
+
+    logits = mx.matmul(q, mx.transpose(k, (0, 1, 3, 2)))
+
+    if self.sink_key_embeddings is not None:
+      sink_v = self.sink_value_embeddings.astype(v.dtype)
+      sink_v_t = mx.transpose(sink_v, (1, 0, 2))
+      sink_v_b = mx.broadcast_to(sink_v_t[None], (v.shape[0],) + sink_v_t.shape)
+      v = mx.concatenate([sink_v_b, v], axis=2)
+      assert sink_logits is not None
+      logits = mx.concatenate([sink_logits, logits], axis=-1)
+      if mask is not None:
+        num_sinks = self.sink_key_embeddings.shape[0]
+        sink_mask = mx.ones(
+            (mask.shape[0], mask.shape[1], mask.shape[2], num_sinks),
+            dtype=mx.bool_,
+        )
+        mask = mx.concatenate([sink_mask, mask], axis=-1)
+
+    if mask is not None:
+      large_neg = mx.array(-1e9, dtype=logits.dtype)
+      logits = mx.where(mask, logits, large_neg)
+
+    logits_f32 = (
+        logits.astype(mx.float32) if logits.dtype != mx.float32 else logits
+    )
+    weights = mx.softmax(logits_f32, axis=-1).astype(v.dtype)
+    context = mx.matmul(weights, v)
+    context = mx.transpose(context, (0, 2, 1, 3))
+
+    emits = mx.transpose(weights, (0, 2, 1, 3))
+    return context, emits
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    """Returns the feature output shape."""
+    if len(input_shape) != 1:
+      raise ValueError(
+          'StreamingDotProductAttention requires rank 3 input,'
+          f' got channel_shape={input_shape}.'
+      )
+    return (self.num_heads, self.units_per_head)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    if self.compute_dtype is not None:
+      return self.compute_dtype
+    return self._param_dtype
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    source = self._get_source(constants)
+    self._ensure_initialized(input_spec.shape[-1], source.shape[-1])
+
+    compute_dtype = self.get_output_dtype(input_spec.dtype)
+    max_past = max(0, self.max_past_horizon)
+    max_future = max(0, self.max_future_horizon)
+    kv_buffer_size = max_past + max_future
+
+    kv_shape = (
+        batch_size,
+        kv_buffer_size,
+        self.num_heads,
+        self.units_per_head,
+    )
+    kv_buffer_keys = mx.zeros(kv_shape, dtype=compute_dtype)
+    kv_buffer_values = mx.zeros(kv_shape, dtype=compute_dtype)
+    kv_buffer_mask = mx.zeros((batch_size, kv_buffer_size), dtype=mx.bool_)
+    time_step = mx.zeros((batch_size,), dtype=mx.int32)
+
+    # Q/K/V network states.
+    q_net_state = (
+        self.query_network.get_initial_state(
+            batch_size,
+            types.ShapeDType(
+                (self.num_heads, self.units_per_head),
+                compute_dtype,
+            ),
+            training=training,
+            constants=constants,
+        )
+        if self.query_network is not None
+        else ()
+    )
+    k_net_state = (
+        self.key_network.get_initial_state(
+            batch_size,
+            types.ShapeDType(
+                (self.num_heads, self.units_per_head),
+                compute_dtype,
+            ),
+            training=training,
+            constants=constants,
+        )
+        if self.key_network is not None
+        else ()
+    )
+    v_net_state = (
+        self.value_network.get_initial_state(
+            batch_size,
+            types.ShapeDType(
+                (self.num_heads, self.units_per_head),
+                compute_dtype,
+            ),
+            training=training,
+            constants=constants,
+        )
+        if self.value_network is not None
+        else ()
+    )
+
+    # Query delay buffer for future horizon.
+    if max_future and self.use_query_delay_buffer:
+      q_delay_values = mx.zeros(
+          (
+              batch_size,
+              max_future,
+              self.num_heads,
+              self.units_per_head,
+          ),
+          dtype=compute_dtype,
+      )
+      q_delay_mask = mx.zeros((batch_size, max_future), dtype=mx.bool_)
+    else:
+      q_delay_values = ()
+      q_delay_mask = ()
+
+    return (
+        kv_buffer_keys,
+        kv_buffer_values,
+        kv_buffer_mask,
+        time_step,
+        q_net_state,
+        k_net_state,
+        v_net_state,
+        q_delay_values,
+        q_delay_mask,
+    )
+
+  @override
+  def layer_with_emits(self, x, *, training: bool, constants=None):
+    source = self._get_source(constants)
+    self._ensure_initialized(x.shape[-1], source.shape[-1])
+
+    proj_q = self._project_q_fn or self._project_q
+    queries = proj_q(x)
+    proj_kv = self._project_kv_fn or self._project_kv
+    keys, values = proj_kv(source)
+    queries_time = queries.shape[1]
+    keys_time = keys.shape[1]
+
+    # Optional Q/K/V processing networks.
+    if self.query_network is not None:
+      queries = Sequence(
+          self.query_network.layer(
+              queries, training=training, constants=constants
+          ).values,
+          queries.mask,
+      )
+    if self.key_network is not None:
+      keys = Sequence(
+          self.key_network.layer(
+              keys, training=training, constants=constants
+          ).values,
+          keys.mask,
+      )
+    if self.value_network is not None:
+      values = Sequence(
+          self.value_network.layer(
+              values, training=training, constants=constants
+          ).values,
+          values.mask,
+      )
+
+    # Mask invalid values.
+    values = values.mask_invalid()
+
+    # Build visibility mask: banded + source validity.
+    valid_mask = source.mask[:, None, None, :]
+    banded = _banded_mask(
+        queries_time,
+        keys_time,
+        num_lower=self.max_past_horizon,
+        num_upper=self.max_future_horizon,
+    )
+    valid_mask = valid_mask & banded
+
+    context, probs = self._compute_attention(
+        queries.values,
+        keys.values,
+        values.values,
+        valid_mask,
+        emit_attention_weights=self.config.emit_attention_weights,
+    )
+    emits = ()
+    if self.config.emit_attention_weights:
+      emits = CrossAttentionEmits({self.source_name: Sequence(probs, x.mask)})
+    return Sequence(context, x.mask), emits
+
+  @override
+  def step_with_emits(self, x, state: Any, *, training: bool, constants=None):
+    source = self._get_source(constants)
+    self._ensure_initialized(x.shape[-1], source.shape[-1])
+
+    if x.shape[1] != source.shape[1]:
+      raise ValueError(
+          f'Expected x.shape[1]={x.shape[1]} to match'
+          f' source.shape[1]={source.shape[1]}'
+      )
+
+    (
+        kv_buf_k,
+        kv_buf_v,
+        kv_buf_mask,
+        time_step,
+        q_net_state,
+        k_net_state,
+        v_net_state,
+        q_delay_values,
+        q_delay_mask,
+    ) = state
+
+    kv_buffer_size = kv_buf_k.shape[1]
+    x_time = x.shape[1]
+
+    proj_q = self._project_q_fn or self._project_q
+    queries = proj_q(x)
+    proj_kv = self._project_kv_fn or self._project_kv
+    keys, values = proj_kv(source)
+
+    # Optional Q/K/V processing networks.
+    if self.query_network is not None:
+      queries, q_net_state = self.query_network.step(
+          queries, q_net_state, training=training, constants=constants
+      )
+    if self.key_network is not None:
+      keys, k_net_state = self.key_network.step(
+          keys, k_net_state, training=training, constants=constants
+      )
+    if self.value_network is not None:
+      values, v_net_state = self.value_network.step(
+          values, v_net_state, training=training, constants=constants
+      )
+
+    # Mask invalid values.
+    values = values.mask_invalid()
+
+    # Concatenate new K/V to buffer.
+    new_k = mx.concatenate([kv_buf_k, keys.values], axis=1)
+    new_v = mx.concatenate([kv_buf_v, values.values], axis=1)
+    new_mask = mx.concatenate([kv_buf_mask, source.mask], axis=1)
+
+    # Handle query delay buffer.
+    has_delay_buffer = not isinstance(q_delay_values, tuple)
+    if has_delay_buffer:
+      # Insert new queries into delay buffer.
+      all_q_values = mx.concatenate([q_delay_values, queries.values], axis=1)
+      all_q_mask = mx.concatenate([q_delay_mask, queries.mask], axis=1)
+      # Pop oldest x_time queries as current.
+      queries = Sequence(all_q_values[:, :x_time], all_q_mask[:, :x_time])
+      # Preserve remaining for next step.
+      q_delay_values = all_q_values[:, -self.max_future_horizon :]
+      q_delay_mask = all_q_mask[:, -self.max_future_horizon :]
+
+    # Build visibility mask.
+    kv_time = new_k.shape[1]
+    valid_mask = new_mask[:, None, None, :]
+
+    vis_mask = _step_visibility_mask(
+        self.max_past_horizon,
+        self.max_future_horizon,
+        x_time,
+        kv_time,
+    )
+    if vis_mask is not None:
+      valid_mask = valid_mask & vis_mask
+
+    context, probs = self._compute_attention(
+        queries.values,
+        new_k,
+        new_v,
+        valid_mask,
+        emit_attention_weights=self.config.emit_attention_weights,
+    )
+
+    # Trim KV buffer to keep only last kv_buffer_size entries.
+    new_k = new_k[:, -kv_buffer_size:]
+    new_v = new_v[:, -kv_buffer_size:]
+    new_mask = new_mask[:, -kv_buffer_size:]
+
+    new_state = (
+        new_k,
+        new_v,
+        new_mask,
+        time_step + x_time,
+        q_net_state,
+        k_net_state,
+        v_net_state,
+        q_delay_values,
+        q_delay_mask,
+    )
+    emits = ()
+    if self.config.emit_attention_weights:
+      emits = CrossAttentionEmits(
+          {self.source_name: Sequence(probs, queries.mask)}
+      )
+    return Sequence(context, queries.mask), new_state, emits
+
+  def to_quantized(
+      self, group_size: int = 64, bits: int = 4, mode: str = 'affine'
+  ):
+    """Convert projection layers to quantized equivalents."""
+    del mode  # Unused in MLX quantize
+    if (
+        getattr(self, 'q_proj', None) is None
+        or self.q_proj.shape[0] % group_size != 0
+    ):
+      return self
+
+    self._quant_group_size = group_size
+    self._quant_bits = bits
+
+    w_q = self.q_proj.T
+    self.q_proj_qw, self.q_proj_qs, self.q_proj_qb = mx.quantize(
+        w_q, group_size=group_size, bits=bits
+    )
+
+    # kv_proj is already combined [source, 2*qkv_dim].
+    w_kv = self.kv_proj.T
+    self.kv_proj_qw, self.kv_proj_qs, self.kv_proj_qb = mx.quantize(
+        w_kv, group_size=group_size, bits=bits
+    )
+
+    self.q_proj = cast(Any, None)
+    self.kv_proj = cast(Any, None)
+
+    def _project_q(self, x):
+      b, t = x.shape[0], x.shape[1]
+      dtype = self.compute_dtype or x.dtype
+      v = x.values.astype(dtype)
+      q = _quantized_matmul_proj(
+          v,
+          self.q_proj_qw,
+          self.q_proj_qs,
+          self.q_proj_qb,
+          self._quant_group_size,
+          self._quant_bits,
+      )
+      if self.use_bias:
+        q = q + self.q_bias.astype(dtype)
+      q = q.reshape(b, t, self.num_heads, self.units_per_head)
+      return Sequence(q, x.mask)
+
+    def _project_kv(self, source):
+      b, t = source.shape[0], source.shape[1]
+      dtype = self.compute_dtype or source.dtype
+      v = source.values.astype(dtype)
+      kv = _quantized_matmul_proj(
+          v,
+          self.kv_proj_qw,
+          self.kv_proj_qs,
+          self.kv_proj_qb,
+          self._quant_group_size,
+          self._quant_bits,
+      )
+      k, val = mx.split(kv, 2, axis=-1)
+      if self.use_bias:
+        kv_bias = self.kv_bias.astype(dtype)
+        kb, vb = mx.split(kv_bias, 2, axis=-1)
+        k = k + kb
+        val = val + vb
+      k = k.reshape(b, t, self.num_heads, self.units_per_head)
+      val = val.reshape(b, t, self.num_heads, self.units_per_head)
+      return Sequence(k, source.mask), Sequence(val, source.mask)
+
+    self._project_q_fn = MethodType(_project_q, self)
+    self._project_kv_fn = MethodType(_project_kv, self)
+
+    return self
+
+  @classmethod
+  def from_config(cls, config: Any) -> 'StreamingDotProductAttention':
+    """Create from a Linen StreamingDotProductAttention.Config."""
+    mlx_config = cls.Config(
+        source_name=config.source_name,
+        num_heads=config.num_heads,
+        units_per_head=config.units_per_head,
+        block_size=getattr(config, 'block_size', 1),
+        max_past_horizon=config.max_past_horizon,
+        max_future_horizon=config.max_future_horizon,
+        attention_probabilities_dropout_rate=config.attention_probabilities_dropout_rate,
+        broadcast_dropout_across_queries=config.broadcast_dropout_across_queries,
+        use_bias=config.use_bias,
+        use_query_delay_buffer=getattr(config, 'use_query_delay_buffer', True),
+        input_projection=_map_projection_config(config.input_projection),
+        query_network=config.query_network,
+        key_network=config.key_network,
+        value_network=config.value_network,
+        attention_logits_soft_cap=config.attention_logits_soft_cap,
+        per_dim_scale=config.per_dim_scale,
+        query_scale=config.query_scale,
+        zero_fully_masked=config.zero_fully_masked,
+        compute_dtype=config.compute_dtype,
+        param_dtype=config.param_dtype or mx.float32,
+        num_sink_embeddings=getattr(config, 'num_sink_embeddings', 0),
+        use_sink_scalars=getattr(config, 'use_sink_scalars', False),
+        use_kv_cache_ringbuffer=getattr(
+            config, 'use_kv_cache_ringbuffer', False
+        ),
+        name=config.name,
+    )
+    return cls(mlx_config)
+
+
+class LocalDotProductSelfAttention(
+    DotProductSelfAttention,
+    attention_spec.LocalDotProductSelfAttention[
+        types.Sequence, types.ChannelSpec
+    ],
+):
+  """Local dot-product self attention with configurable block_size."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(
+      DotProductSelfAttention.Config,
+      attention_spec.LocalDotProductSelfAttention.Config,
+  ):
+    """MLX-native configuration for LocalDotProductSelfAttention."""
+
+    block_size: int = 1
+
+    @override
+    def make(self) -> 'LocalDotProductSelfAttention':
+      return LocalDotProductSelfAttention(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      in_features: int | None = None,
+      num_heads: int | None = None,
+      units_per_head: int | None = None,
+      max_past_horizon: int | None = None,
+      max_future_horizon: int = 0,
+      num_kv_heads: int | None = None,
+      use_bias: bool = False,
+      query_scale: float | None = None,
+      per_dim_scale: bool = False,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+      kernel_init=None,
+      bias_init=None,
+      query_network: types.SequenceLayer | None = None,
+      key_network: types.SequenceLayer | None = None,
+      value_network: types.SequenceLayer | None = None,
+      attention_logits_soft_cap: float | None = None,
+      num_sink_embeddings: int = 0,
+      input_projection=None,
+      block_size: int | None = None,
+      block_size_config: int | None = None,
+  ):
+    if block_size is None:
+      block_size = block_size_config if block_size_config is not None else 1
+
+    if config is None:
+      if (
+          num_heads is None
+          or units_per_head is None
+          or max_past_horizon is None
+      ):
+        raise ValueError(
+            'Must provide either config or num_heads, units_per_head, and'
+            ' max_past_horizon'
+        )
+      num_heads = cast(int, num_heads)
+      units_per_head = cast(int, units_per_head)
+      max_past_horizon = cast(int, max_past_horizon)
+      input_projection_val = (
+          input_projection
+          or projection_configs.CombinedQueryKeyValueProjection()
+      )
+      config = LocalDotProductSelfAttention.Config(
+          num_heads=num_heads,
+          units_per_head=units_per_head,
+          max_past_horizon=max_past_horizon,
+          max_future_horizon=max_future_horizon,
+          num_kv_heads=num_kv_heads,
+          use_bias=use_bias,
+          query_scale=query_scale,
+          per_dim_scale=per_dim_scale,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+          query_network=query_network,
+          key_network=key_network,
+          value_network=value_network,
+          attention_logits_soft_cap=attention_logits_soft_cap,
+          num_sink_embeddings=num_sink_embeddings,
+          input_projection=input_projection_val,
+          block_size=block_size,
+      )
+    super().__init__(
+        config,
+        in_features=in_features,
+        kernel_init=kernel_init,
+        bias_init=bias_init,
+    )
+    self._block_size_config = config.block_size
+
+  @property
+  @override
+  def block_size(self):
+    return self._block_size_config
+
+  @classmethod
+  @override
+  def from_config(cls, config: Any) -> 'LocalDotProductSelfAttention':
+    # pylint: disable=unexpected-keyword-arg
+    mlx_config = LocalDotProductSelfAttention.Config(
+        num_heads=config.num_heads,
+        units_per_head=config.units_per_head,
+        max_past_horizon=config.max_past_horizon,
+        max_future_horizon=config.max_future_horizon,
+        num_kv_heads=getattr(config, 'num_kv_heads', None),
+        attention_probabilities_dropout_rate=config.attention_probabilities_dropout_rate,
+        broadcast_dropout_across_queries=config.broadcast_dropout_across_queries,
+        use_bias=config.use_bias,
+        input_projection=_map_projection_config(config.input_projection),
+        query_network=config.query_network,
+        key_network=config.key_network,
+        value_network=config.value_network,
+        attention_logits_soft_cap=config.attention_logits_soft_cap,
+        per_dim_scale=config.per_dim_scale,
+        query_scale=config.query_scale,
+        zero_fully_masked=config.zero_fully_masked,
+        compute_dtype=config.compute_dtype,
+        param_dtype=config.param_dtype or mx.float32,
+        num_sink_embeddings=config.num_sink_embeddings,
+        use_sink_scalars=config.use_sink_scalars,
+        use_kv_cache_ringbuffer=config.use_kv_cache_ringbuffer,
+        block_size=config.block_size,
+        name=config.name,
+    )
+    return cls(mlx_config)
+
+
+StreamingLocalDotProductAttention = StreamingDotProductAttention
diff --git a/sequence_layers/mlx/attention_test.py b/sequence_layers/mlx/attention_test.py
new file mode 100644
index 0000000..cdebb47
--- /dev/null
+++ b/sequence_layers/mlx/attention_test.py
@@ -0,0 +1,281 @@
+"""Tests for attention MLX sequence layers."""
+
+from absl.testing import absltest
+from absl.testing import parameterized
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.mlx import attention
+from sequence_layers.mlx import basic_types as bt
+from sequence_layers.mlx import position
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import attention_behaviors as spec
+
+
+class DotProductSelfAttentionTest(
+    test_utils.SequenceLayerTest, spec.DotProductSelfAttentionTest
+):
+
+  def test_step_builds_kv_cache(self):
+    layer = attention.DotProductSelfAttention(
+        in_features=8,
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=10,
+    )
+    spec = bt.ShapeDType((8,), mx.float32)
+    state = layer.get_initial_state(1, spec, training=False)
+
+    for i in range(5):
+      x = bt.MaskedSequence(
+          mx.random.normal(shape=(1, 1, 8)),
+          mx.ones((1, 1), dtype=mx.bool_),
+      )
+      _, state = layer.step(x, state, training=False)
+
+    # Check KV cache has been populated.
+    kv_mask = state[2]
+    self.assertEqual(mx.sum(kv_mask).item(), 5)
+
+  def test_with_query_key_networks(self):
+    """Test with RoPE on Q/K."""
+    rope = position.ApplyRotaryPositionalEncoding(
+        max_wavelength=10000.0, axis=-1
+    )
+    layer = attention.DotProductSelfAttention(
+        in_features=8,
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=32,
+        query_network=rope,
+        key_network=position.ApplyRotaryPositionalEncoding(
+            max_wavelength=10000.0, axis=-1
+        ),
+    )
+    x = test_utils.random_sequence(1, 5, 8)
+    y = layer.layer(x, training=False)
+    self.assertEqual(y.shape, (1, 5, 2, 4))
+
+  def test_per_dim_scale(self):
+    """Test per_dim_scale creates parameter and affects output."""
+    layer = attention.DotProductSelfAttention(
+        in_features=8,
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=32,
+        per_dim_scale=True,
+    )
+    self.assertIsNotNone(layer._per_dim_scale)
+    self.assertEqual(layer._per_dim_scale.shape, (4,))
+    np.testing.assert_array_equal(layer._per_dim_scale, np.zeros(4))
+
+    # At initialization (zeros), output should match per_dim_scale=False.
+    layer_no_pds = attention.DotProductSelfAttention(
+        in_features=8,
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=32,
+        per_dim_scale=False,
+    )
+    # Copy weights so projections match.
+    layer_no_pds.q_proj = layer.q_proj
+    layer_no_pds.kv_proj = layer.kv_proj
+
+    x = test_utils.random_sequence(1, 5, 8)
+    y_pds = layer.layer(x, training=False)
+    y_no_pds = layer_no_pds.layer(x, training=False)
+    np.testing.assert_allclose(
+        np.array(y_pds.values), np.array(y_no_pds.values), atol=1e-5
+    )
+
+    # After modifying per_dim_scale, output should differ.
+    layer._per_dim_scale = mx.ones((4,))
+    y_modified = layer.layer(x, training=False)
+    self.assertFalse(
+        np.allclose(
+            np.array(y_pds.values), np.array(y_modified.values), atol=1e-5
+        )
+    )
+
+
+class DotProductSelfAttentionFromConfigTest(test_utils.SequenceLayerTest):
+
+  def test_from_config(self):
+    from sequence_layers.jax.attention import dot_product_self_attention as jax_attn
+    import sequence_layers.mlx
+
+    config = jax_attn.DotProductSelfAttention.Config(
+        num_heads=4,
+        units_per_head=8,
+        max_past_horizon=32,
+    )
+    mlx_layer = attention.DotProductSelfAttention.from_config(config)
+    self.assertIsInstance(
+        mlx_layer,
+        attention.DotProductSelfAttention,
+    )
+
+    x = test_utils.random_sequence(1, 5, 16)
+    y = mlx_layer.layer(x, training=False)
+    self.assertEqual(y.channel_shape, (4, 8))
+
+
+class DotProductAttentionTest(
+    test_utils.SequenceLayerTest,
+    spec.DotProductAttentionTest,
+):
+  """Tests for cross-attention."""
+
+  def test_from_config(self):
+    from sequence_layers.jax.attention import dot_product_attention as jax_cross_attn
+    import sequence_layers.mlx
+
+    config = jax_cross_attn.DotProductAttention.Config(
+        source_name='enc',
+        num_heads=4,
+        units_per_head=8,
+    )
+    mlx_layer = attention.DotProductAttention.from_config(config)
+    self.assertIsInstance(
+        mlx_layer,
+        attention.DotProductAttention,
+    )
+    source = test_utils.random_sequence(1, 6, 16)
+    constants = {'enc': source}
+    x = test_utils.random_sequence(1, 4, 16)
+    y = mlx_layer.layer(x, constants=constants, training=False)
+    self.assertEqual(y.channel_shape, (4, 8))
+
+
+class StreamingDotProductAttentionTest(
+    test_utils.SequenceLayerTest, spec.StreamingDotProductAttentionTest
+):
+  """Tests for streaming cross-attention."""
+
+  def _make_source(self, batch, time, features, name='source'):
+    return test_utils.random_sequence(batch, time, features)
+
+  def test_step_builds_kv_cache(self):
+    """KV buffer grows correctly during step mode."""
+    layer = attention.StreamingDotProductAttention(
+        in_features=8,
+        source_features=12,
+        source_name='source',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=10,
+    )
+    source = self._make_source(1, 1, 12)
+    spec = bt.ShapeDType((8,), mx.float32)
+    state = layer.get_initial_state(
+        1, spec, training=False, constants={'source': source}
+    )
+
+    for _ in range(5):
+      x = bt.MaskedSequence(
+          mx.random.normal(shape=(1, 1, 8)),
+          mx.ones((1, 1), dtype=mx.bool_),
+      )
+      src = bt.MaskedSequence(
+          mx.random.normal(shape=(1, 1, 12)),
+          mx.ones((1, 1), dtype=mx.bool_),
+      )
+      _, state, _ = layer.step_with_emits(
+          x, state, training=False, constants={'source': src}
+      )
+
+    kv_keys = state[0]
+    self.assertEqual(kv_keys.shape[1], 10)  # buffer size
+
+  def test_no_query_delay_buffer(self):
+    """use_query_delay_buffer=False has no delay."""
+    layer = attention.StreamingDotProductAttention(
+        in_features=8,
+        source_features=8,
+        source_name='source',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=4,
+        max_future_horizon=2,
+        use_query_delay_buffer=False,
+    )
+    self.assertEqual(layer.input_latency, 0)
+    source = self._make_source(1, 8, 8)
+    spec = bt.ShapeDType((8,), mx.float32)
+    state = layer.get_initial_state(
+        1, spec, constants={'source': source}, training=False
+    )
+    # Delay buffer should be empty tuples.
+    self.assertIsInstance(state[7], tuple)
+    self.assertEqual(state[7], ())
+
+  def test_from_config(self):
+    """Both Streaming and StreamingLocal configs produce correct layer."""
+    from sequence_layers.jax.attention import streaming_dot_product_attention as jax_streaming_attn
+    from sequence_layers.jax.attention import streaming_local_dot_product_attention as jax_streaming_local_attn
+    import sequence_layers.mlx
+
+    config = jax_streaming_attn.StreamingDotProductAttention.Config(
+        source_name='source',
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=8,
+    )
+    mlx_layer = attention.StreamingDotProductAttention.from_config(config)
+    self.assertIsInstance(
+        mlx_layer,
+        attention.StreamingDotProductAttention,
+    )
+
+    source = test_utils.random_sequence(1, 6, 8)
+    x = test_utils.random_sequence(1, 6, 8)
+    y = mlx_layer.layer(x, constants={'source': source}, training=False)
+    self.assertEqual(y.channel_shape, (2, 4))
+
+    # StreamingLocal config should also work.
+    local_config = (
+        jax_streaming_local_attn.StreamingLocalDotProductAttention.Config(
+            source_name='source',
+            num_heads=2,
+            units_per_head=4,
+            block_size=2,
+            max_past_horizon=8,
+        )
+    )
+    mlx_local = attention.StreamingDotProductAttention.from_config(local_config)
+    self.assertIsInstance(
+        mlx_local,
+        attention.StreamingDotProductAttention,
+    )
+
+
+class LocalDotProductSelfAttentionTest(
+    test_utils.SequenceLayerTest, spec.LocalDotProductSelfAttentionTest
+):
+
+  test_step_in_future_horizon = False
+
+  def test_from_config(self):
+    from sequence_layers.jax.attention import local_dot_product_self_attention as jax_local_attn
+    import sequence_layers.mlx
+
+    config = jax_local_attn.LocalDotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        block_size=2,
+        max_past_horizon=8,
+    )
+    mlx_layer = attention.LocalDotProductSelfAttention.from_config(config)
+    self.assertIsInstance(
+        mlx_layer,
+        attention.LocalDotProductSelfAttention,
+    )
+    self.assertEqual(mlx_layer.block_size, 2)
+
+    x = test_utils.random_sequence(1, 8, 8)
+    y = mlx_layer.layer(x, training=False)
+    self.assertEqual(y.channel_shape, (2, 4))
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/combinators.py b/sequence_layers/mlx/combinators.py
new file mode 100644
index 0000000..a3dc1ad
--- /dev/null
+++ b/sequence_layers/mlx/combinators.py
@@ -0,0 +1,712 @@
+"""Combinators (Serial, Residual, Repeat, Parallel) for MLX."""
+
+import dataclasses
+from fractions import Fraction
+from functools import reduce
+from math import lcm
+from typing import Any, Callable, override
+from typing import Sequence as _Sequence
+
+import mlx.core as mx
+
+from sequence_layers.mlx import simple as simple_lib
+from sequence_layers.mlx import types
+from sequence_layers.mlx import utils as mlx_utils
+from sequence_layers.specs import combinators as spec
+
+from . import types as bt
+
+Sequence = bt.Sequence
+CombinationMode = spec.CombinationMode
+
+
+def _broadcast_shapes(*shapes):
+  """Numpy-style shape broadcasting."""
+  if not shapes:
+    return ()
+  max_dims = max(len(s) for s in shapes)
+  if max_dims == 0:
+    return ()
+  padded = [(1,) * (max_dims - len(s)) + tuple(s) for s in shapes]
+  result = []
+  for dims in zip(*padded):
+    max_dim = max(dims)
+    for d in dims:
+      if d not in (1, max_dim):
+        raise ValueError(f'Shapes not broadcastable: {shapes}')
+    result.append(max_dim)
+  return tuple(result)
+
+
+def _combine_output_channel_shape(mode, *channel_shapes):
+  """Compute the output channel shape for a combination mode."""
+  max_dims = max(len(x) for x in channel_shapes)
+  padded = tuple((1,) * (max_dims - len(x)) + tuple(x) for x in channel_shapes)
+
+  if mode == CombinationMode.STACK:
+    bcast = _broadcast_shapes(*padded)
+    return (len(channel_shapes),) + bcast
+  if mode == CombinationMode.CONCAT:
+    if max_dims == 0:
+      # All scalar → treat as (1,) each.
+      padded = tuple((1,) for _ in channel_shapes)
+    prefixes = tuple(x[:-1] for x in padded)
+    bcast_prefix = _broadcast_shapes(*prefixes)
+    final_dim = sum(x[-1] for x in padded)
+    return bcast_prefix + (final_dim,)
+  # ADD, MEAN, PRODUCT
+  return _broadcast_shapes(*padded)
+
+
+def _combine_sequences(mode, sequences):
+  """Combine parallel output sequences."""
+  values_list = [s.values for s in sequences]
+  masks = [s.mask for s in sequences]
+  mask = masks[0]
+  for m in masks[1:]:
+    mask = mask & m
+
+  if mode == CombinationMode.STACK:
+    values = mx.stack(values_list, axis=2)
+  elif mode == CombinationMode.CONCAT:
+    values = mx.concatenate(values_list, axis=-1)
+  elif mode == CombinationMode.ADD:
+    values = values_list[0]
+    for v in values_list[1:]:
+      values = values + v
+  elif mode == CombinationMode.MEAN:
+    values = values_list[0]
+    for v in values_list[1:]:
+      values = values + v
+    values = values / len(values_list)
+  elif mode == CombinationMode.PRODUCT:
+    values = values_list[0]
+    for v in values_list[1:]:
+      values = values * v
+  else:
+    raise ValueError(f'Unknown combination mode: {mode}')
+
+  return Sequence(values, mask)
+
+
+class SerialCombinatorMixin:
+  """Mixin for Serial logic.
+
+  Provides serial processing (layer, step, initial state) for classes that
+  define a ``layers`` attribute containing a sequence of SequenceLayers.
+  """
+
+  @property
+  def layers(self) -> list[types.SequenceLayer]:
+    """Returns the list of layers in the serial combinator."""
+    raise NotImplementedError()
+
+  @property
+  def supports_step(self):
+    """Returns whether all layers support step-wise execution."""
+    return all(l.supports_step for l in self.layers)
+
+  @property
+  def block_size(self):
+    """Returns the accumulated block size of the layers."""
+    return reduce(lcm, (l.block_size for l in self.layers), 1)
+
+  @property
+  def output_ratio(self):
+    """Returns the accumulated output ratio of the layers."""
+    r = self.layers[0].output_ratio if self.layers else Fraction(1)
+    for l in self.layers[1:]:
+      r = r * l.output_ratio
+    return r
+
+  @property
+  def input_latency(self):
+    """Returns the accumulated input latency of the layers."""
+    latency = 0
+    for l in self.layers:
+      latency = l.get_accumulated_input_latency(latency)
+    return latency
+
+  @property
+  def output_latency(self):
+    """Returns the accumulated output latency of the layers."""
+    return int(self.input_latency * self.output_ratio)
+
+  def get_output_shape(self, input_shape, *, constants=None):
+    """Returns the output shape of the serial combination."""
+    shape = input_shape
+    for l in self.layers:
+      shape = l.get_output_shape(shape, constants=constants)
+    return shape
+
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    """Returns the output dtype of the serial combination."""
+    dtype = input_dtype
+    for l in self.layers:
+      dtype = l.get_output_dtype(dtype, constants=constants)
+    return dtype
+
+  def get_initial_state(
+      self,
+      batch_size,
+      input_spec,
+      *,
+      training: bool = False,
+      constants=None,
+      **kwargs,
+  ):
+    """Returns the initial state for all layers in the serial combination."""
+    curr_spec = input_spec
+    states = []
+    for l in self.layers:
+      states.append(
+          mlx_utils.call_get_initial_state(
+              l,
+              batch_size,
+              curr_spec,
+              training=training,
+              constants=constants,
+              **kwargs,
+          )
+      )
+      curr_spec = l.get_output_spec(curr_spec, constants=constants)
+    return tuple(states)
+
+  def layer_with_emits(
+      self, x, *, training: bool = False, constants=None, **kwargs
+  ):
+    """Process layer-wise through all child layers, accumulating emits."""
+    emits = {}
+    for i, l in enumerate(self.layers):
+      x, e = mlx_utils.call_layer_with_emits(
+          l, x, training=training, constants=constants, **kwargs
+      )
+      emits[f'layer_{i}'] = e
+    return x, emits
+
+  def step_with_emits(
+      self, x, state, *, training: bool = False, constants=None, **kwargs
+  ):
+    """Process step-wise through all child layers, accumulating emits."""
+    new_state = []
+    emits = {}
+    for i, (l, s) in enumerate(zip(self.layers, state)):
+      x, s, e = mlx_utils.call_step_with_emits(
+          l, x, s, training=training, constants=constants, **kwargs
+      )
+      new_state.append(s)
+      emits[f'layer_{i}'] = e
+    return x, tuple(new_state), emits
+
+
+class SerialModules(
+    SerialCombinatorMixin,
+    types.Emitting,
+    spec.SerialModules[types.Sequence, types.ShapeDType],
+):
+  """A Serial combinator that wraps pre-existing SequenceLayers.
+
+  Unlike Serial (which owns its layers), SerialModules references
+  pre-constructed modules parented elsewhere. This avoids duplication
+  when a module graph shares sub-layers across different combinators.
+  """
+
+  def __init__(self, layers: _Sequence[types.SequenceLayer]):
+    super().__init__()
+    self._layers = list(layers)
+
+  @property
+  @override
+  def layers(self) -> list[types.SequenceLayer]:
+    return self._layers
+
+
+class Serial(
+    SerialCombinatorMixin,
+    types.Emitting,
+    spec.Serial[types.Sequence, types.ShapeDType],
+):
+  """Processes SequenceLayers serially."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Serial.Config):
+    """Configuration for Serial."""
+
+    layers: _Sequence[types.SequenceLayerConfig] = ()
+    name: str | None = None
+
+    def __post_init__(self):
+      object.__setattr__(self, 'layers', tuple(self.layers))
+
+    @override
+    def make(self) -> 'Serial':
+      return Serial.from_config(self)
+
+  def __init__(
+      self,
+      layers: list[types.SequenceLayer],
+      names: list[str | None] | None = None,
+  ):
+    super().__init__()
+    self.config = None
+    self._layer_names = []
+    for i, l in enumerate(layers):
+      name = f'layers_{i}'
+      if names is not None:
+        name_opt = names[i]
+        if isinstance(name_opt, str):
+          name = name_opt
+      self._layer_names.append(name)
+      setattr(self, name, l)
+      setattr(self, f'layers_{i}', l)
+
+  @property
+  @override
+  def layers(self) -> list[types.SequenceLayer]:
+    return [getattr(self, name) for name in self._layer_names]
+
+  @classmethod
+  def from_config(cls, config, backend='mlx'):
+    """Creates a Serial layer from a configuration object."""
+    layers = [mlx_utils.make_layer(c, backend=backend) for c in config.layers]
+    names = [getattr(c, 'name', None) for c in config.layers]
+    instance = cls(layers, names=names)
+    instance.config = config
+    return instance
+
+
+class Residual(types.Emitting, spec.Residual[types.Sequence, types.ShapeDType]):
+  """Residual wrapper: y = body(x) + shortcut(x)."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Residual.Config):
+    """Configuration for Residual."""
+
+    layers: _Sequence[types.SequenceLayerConfig] = ()
+    shortcut_layers: _Sequence[types.SequenceLayerConfig] | None = None
+    name: str | None = None
+
+    def __post_init__(self):
+      object.__setattr__(self, 'layers', tuple(self.layers))
+      if self.shortcut_layers is not None:
+        object.__setattr__(self, 'shortcut_layers', tuple(self.shortcut_layers))
+
+    @override
+    def make(self) -> 'Residual':
+      return Residual.from_config(self)
+
+  def __init__(
+      self,
+      layers: list[types.SequenceLayer],
+      *,
+      names: list[str | None] | None = None,
+      shortcut: types.SequenceLayer | None = None,
+  ):
+    super().__init__()
+    self.config = None
+    self.body = Serial(layers, names=names)
+    self.shortcut = (
+        shortcut
+        if shortcut is not None
+        else simple_lib.Identity(simple_lib.Identity.Config())
+    )
+
+  @property
+  @override
+  def supports_step(self):
+    return self.body.supports_step and self.shortcut.supports_step
+
+  @property
+  @override
+  def block_size(self):
+    return lcm(self.body.block_size, self.shortcut.block_size)
+
+  @property
+  @override
+  def output_ratio(self):
+    return self.body.output_ratio
+
+  @property
+  @override
+  def input_latency(self):
+    return self.body.input_latency
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return self.body.get_output_shape(input_shape, constants=constants)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.body.get_output_dtype(input_dtype, constants=constants)
+
+  @override
+  def get_initial_state(
+      self,
+      batch_size,
+      input_spec,
+      *,
+      training: bool = False,
+      constants=None,
+      **kwargs,
+  ):
+    body_state = mlx_utils.call_get_initial_state(
+        self.body,
+        batch_size,
+        input_spec,
+        training=training,
+        constants=constants,
+        **kwargs,
+    )
+    shortcut_state = mlx_utils.call_get_initial_state(
+        self.shortcut,
+        batch_size,
+        input_spec,
+        training=training,
+        constants=constants,
+        **kwargs,
+    )
+    return (body_state, shortcut_state)
+
+  def _residual_fn(self, y_body, y_shortcut):
+    """Combines output of body and shortcut layers residuals."""
+    y_values = y_body.values + y_shortcut.values
+    y_mask = y_body.mask & y_shortcut.mask
+    return Sequence(y_values, y_mask)
+
+  @override
+  def layer_with_emits(
+      self, x, *, training: bool = False, constants=None, **kwargs
+  ):
+    y_body, body_emits = mlx_utils.call_layer_with_emits(
+        self.body, x, training=training, constants=constants, **kwargs
+    )
+    y_shortcut, shortcut_emits = mlx_utils.call_layer_with_emits(
+        self.shortcut, x, training=training, constants=constants, **kwargs
+    )
+    y = self._residual_fn(y_body, y_shortcut)
+    return y, (body_emits, shortcut_emits)
+
+  @override
+  def step_with_emits(
+      self, x, state: Any, *, training: bool = False, constants=None, **kwargs
+  ):
+    body_state, shortcut_state = state
+    y_body, body_state, body_emits = mlx_utils.call_step_with_emits(
+        self.body,
+        x,
+        body_state,
+        training=training,
+        constants=constants,
+        **kwargs,
+    )
+    y_shortcut, shortcut_state, shortcut_emits = mlx_utils.call_step_with_emits(
+        self.shortcut,
+        x,
+        shortcut_state,
+        training=training,
+        constants=constants,
+        **kwargs,
+    )
+    y = self._residual_fn(y_body, y_shortcut)
+    return (
+        y,
+        (body_state, shortcut_state),
+        (body_emits, shortcut_emits),
+    )
+
+  @classmethod
+  def from_config(cls, config, backend='mlx'):
+    """Creates a Residual layer from a configuration object."""
+    layers = [mlx_utils.make_layer(c, backend=backend) for c in config.layers]
+    names = [getattr(c, 'name', None) for c in config.layers]
+    shortcut = None
+    if hasattr(config, 'shortcut_layers') and config.shortcut_layers:
+      shortcut_layers = [
+          mlx_utils.make_layer(c, backend=backend)
+          for c in config.shortcut_layers
+      ]
+      shortcut_names = [
+          getattr(c, 'name', None) for c in config.shortcut_layers
+      ]
+      if len(shortcut_layers) == 1:
+        shortcut = shortcut_layers[0]
+      else:
+        shortcut = Serial(shortcut_layers, names=shortcut_names)
+    instance = cls(layers, names=names, shortcut=shortcut)
+    instance.config = config
+    return instance
+
+
+class Repeat(types.Emitting, spec.Repeat[types.Sequence, types.ShapeDType]):
+  """Repeats a SequenceLayer N times.
+
+  Unlike Linen/NNX which use scan/vmap to share stacked params,
+  MLX Repeat creates N independent copies of the child layer.
+  Each copy has its own parameters.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Repeat.Config):
+    """Configuration for Repeat."""
+
+    layer: types.SequenceLayerConfig
+    num_repeats: int
+    remat: bool = False
+    prevent_cse: bool = False
+    policy: Callable[..., bool] | None = None
+    unroll_layer: bool = False
+    unroll_step: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Repeat':
+      return Repeat.from_config(self)
+
+  def __init__(
+      self,
+      layers: _Sequence[types.SequenceLayer],
+  ):
+    super().__init__()
+    self.config = None
+    if not layers:
+      raise ValueError('Repeat requires at least one layer.')
+    self.layers = list(layers)
+    self.num_repeats = len(layers)
+
+  @property
+  @override
+  def supports_step(self):
+    return all(l.supports_step for l in self.layers)
+
+  @property
+  @override
+  def block_size(self):
+    return self.layers[0].block_size
+
+  @property
+  @override
+  def output_ratio(self):
+    return self.layers[0].output_ratio
+
+  @property
+  @override
+  def input_latency(self):
+    latency = 0
+    for l in self.layers:
+      latency = l.get_accumulated_input_latency(latency)
+    return latency
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return self.layers[0].get_output_shape(input_shape, constants=constants)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.layers[0].get_output_dtype(input_dtype, constants=constants)
+
+  @override
+  def get_initial_state(
+      self,
+      batch_size,
+      input_spec,
+      *,
+      training: bool = False,
+      constants=None,
+      **kwargs,
+  ):
+    states = []
+    curr_spec = input_spec
+    for l in self.layers:
+      states.append(
+          mlx_utils.call_get_initial_state(
+              l,
+              batch_size,
+              curr_spec,
+              training=training,
+              constants=constants,
+              **kwargs,
+          )
+      )
+    return tuple(states)
+
+  @override
+  def layer_with_emits(
+      self, x, *, training: bool = False, constants=None, **kwargs
+  ):
+    emits = {}
+    for i, l in enumerate(self.layers):
+      x, e = mlx_utils.call_layer_with_emits(
+          l, x, training=training, constants=constants, **kwargs
+      )
+      emits[f'repeat_{i}'] = e
+    return x, emits
+
+  @override
+  def step_with_emits(
+      self, x, state: Any, *, training: bool = False, constants=None, **kwargs
+  ):
+    new_state = []
+    emits = {}
+    for i, (l, s) in enumerate(zip(self.layers, state)):
+      x, s, e = mlx_utils.call_step_with_emits(
+          l, x, s, training=training, constants=constants, **kwargs
+      )
+      new_state.append(s)
+      emits[f'repeat_{i}'] = e
+    return x, tuple(new_state), emits
+
+  @classmethod
+  def from_config(cls, config, backend='mlx'):
+    """Creates a Repeat layer from a configuration object."""
+    layers = [
+        mlx_utils.make_layer(config.layer, backend=backend)
+        for _ in range(config.num_repeats)
+    ]
+    instance = cls(layers)
+    instance.config = config
+    return instance
+
+
+class Parallel(types.Emitting, spec.Parallel[types.Sequence, types.ShapeDType]):
+  """Runs N children on the same input and combines outputs.
+
+  All children must have equal output_ratio and block_size.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Parallel.Config):
+    """Configuration for Parallel."""
+
+    layers: _Sequence[types.SequenceLayerConfig]
+    combination: CombinationMode = CombinationMode.STACK
+    share_scope: bool | _Sequence[bool] = False
+    name: str | None = None
+
+    def __post_init__(self):
+      object.__setattr__(self, 'layers', tuple(self.layers))
+
+    @override
+    def make(self) -> 'Parallel':
+      return Parallel.from_config(self)
+
+  def __init__(
+      self,
+      layers: _Sequence[types.SequenceLayer],
+      *,
+      combination: CombinationMode = CombinationMode.STACK,
+  ):
+    super().__init__()
+    self.config = None
+    if not layers:
+      raise ValueError('Parallel requires at least one layer.')
+    self.layers = list(layers)
+    self.combination = combination
+
+    # Validate constraints.
+    ratios = {l.output_ratio for l in self.layers}
+    if len(ratios) > 1:
+      raise ValueError(
+          f'All Parallel children must have equal output_ratio, got {ratios}.'
+      )
+    blocks = {l.block_size for l in self.layers}
+    if len(blocks) > 1:
+      raise ValueError(
+          f'All Parallel children must have equal block_size, got {blocks}.'
+      )
+
+  @property
+  @override
+  def supports_step(self):
+    return all(l.supports_step for l in self.layers)
+
+  @property
+  @override
+  def block_size(self):
+    return reduce(lcm, (l.block_size for l in self.layers), 1)
+
+  @property
+  @override
+  def output_ratio(self):
+    return self.layers[0].output_ratio
+
+  @property
+  @override
+  def input_latency(self):
+    return self.layers[0].input_latency
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    shapes = tuple(
+        l.get_output_shape(input_shape, constants=constants)
+        for l in self.layers
+    )
+    return _combine_output_channel_shape(self.combination, *shapes)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.layers[0].get_output_dtype(input_dtype, constants=constants)
+
+  @override
+  def get_initial_state(
+      self,
+      batch_size,
+      input_spec,
+      *,
+      training: bool = False,
+      constants=None,
+      **kwargs,
+  ):
+    states = []
+    for l in self.layers:
+      states.append(
+          mlx_utils.call_get_initial_state(
+              l,
+              batch_size,
+              input_spec,
+              training=training,
+              constants=constants,
+              **kwargs,
+          )
+      )
+    return tuple(states)
+
+  @override
+  def layer_with_emits(
+      self, x, *, training: bool = False, constants=None, **kwargs
+  ):
+    outputs = []
+    emits = {}
+    for i, l in enumerate(self.layers):
+      y, e = mlx_utils.call_layer_with_emits(
+          l, x, training=training, constants=constants, **kwargs
+      )
+      outputs.append(y)
+      emits[f'parallel_{i}'] = e
+    combined = _combine_sequences(self.combination, outputs)
+    return combined, emits
+
+  @override
+  def step_with_emits(
+      self, x, state: Any, *, training: bool = False, constants=None, **kwargs
+  ):
+    outputs = []
+    new_state = []
+    emits = {}
+    for i, (l, s) in enumerate(zip(self.layers, state)):
+      y, s, e = mlx_utils.call_step_with_emits(
+          l, x, s, training=training, constants=constants, **kwargs
+      )
+      outputs.append(y)
+      new_state.append(s)
+      emits[f'parallel_{i}'] = e
+    combined = _combine_sequences(self.combination, outputs)
+    return combined, tuple(new_state), emits
+
+  @classmethod
+  def from_config(cls, config, backend='mlx'):
+    """Creates a Parallel layer from a configuration object."""
+    layers = [mlx_utils.make_layer(c, backend=backend) for c in config.layers]
+    combination = CombinationMode(config.combination.value)
+    instance = cls(layers, combination=combination)
+    instance.config = config
+    return instance
diff --git a/sequence_layers/mlx/combinators_test.py b/sequence_layers/mlx/combinators_test.py
new file mode 100644
index 0000000..4f62916
--- /dev/null
+++ b/sequence_layers/mlx/combinators_test.py
@@ -0,0 +1,284 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for combinator MLX sequence layers."""
+
+# pylint: disable=import-outside-toplevel
+from absl.testing import absltest
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.mlx import basic_types as bt
+from sequence_layers.mlx import combinators
+from sequence_layers.mlx import dense
+from sequence_layers.mlx import simple
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import combinators_behaviors as spec_behaviors
+
+
+class CombinatorBehaviorsTest(
+    test_utils.SequenceLayerTest, spec_behaviors.CombinatorBehaviorsTest
+):
+  """Shared behavior tests for combinators in MLX."""
+
+
+class SerialTest(test_utils.SequenceLayerTest):
+
+  def test_identity_serial(self):
+    layer = combinators.Serial([
+        simple.Identity.Config().make(),
+        simple.Identity.Config().make(),
+    ])
+    x = self.random_sequence(2, 5, 4)
+    self.verify_contract(layer, x)
+
+  def test_dense_serial(self):
+    layer = combinators.Serial([
+        dense.Dense.Config(features=8).make(),
+        dense.Dense.Config(features=16).make(),
+    ])
+    x = self.random_sequence(2, 5, 4)
+    self.verify_contract(layer, x)
+
+  def test_output_shape(self):
+    layer = combinators.Serial([
+        dense.Dense.Config(features=8).make(),
+        dense.Dense.Config(features=16).make(),
+    ])
+    self.assertEqual(layer.get_output_shape((4,)), (16,))
+
+  def test_from_config(self):
+    import sequence_layers.jax as sl
+
+    config = sl.Serial.Config([  # pyrefly: ignore[bad-argument-type]
+        sl.Identity.Config(),
+        sl.Dense.Config(features=8),
+    ])
+    mlx_layer = combinators.Serial.from_config(config)
+    self.assertIsInstance(mlx_layer, combinators.Serial)
+
+
+class ResidualTest(test_utils.SequenceLayerTest):
+
+  def test_identity_residual(self):
+    layer = combinators.Residual([simple.Identity.Config().make()])
+    x = self.random_sequence(2, 5, 4)
+    self.verify_contract(layer, x)
+
+  def test_residual_adds(self):
+    layer = combinators.Residual([simple.Identity.Config().make()])
+    x = self.random_sequence(1, 3, 4)
+    y = layer.layer(x, training=False)
+    # y = identity(x) + x = 2 * x
+    expected = x.values * 2
+    np.testing.assert_allclose(y.values, expected, atol=1e-6)
+
+  def test_from_config(self):
+    import sequence_layers.jax as sl
+
+    config = sl.Residual.Config(
+        [sl.Identity.Config()]  # pyrefly: ignore[bad-argument-type]
+    )
+    mlx_layer = combinators.Residual.from_config(config)
+    self.assertIsInstance(mlx_layer, combinators.Residual)
+
+
+class RepeatTest(test_utils.SequenceLayerTest):
+
+  def test_repeat_identity(self):
+    layers = [simple.Identity.Config().make() for _ in range(3)]
+    layer = combinators.Repeat(layers)
+    x = self.random_sequence(2, 5, 4)
+    self.verify_contract(layer, x)
+
+  def test_repeat_dense(self):
+    layers = [dense.Dense.Config(features=4).make() for _ in range(3)]
+    layer = combinators.Repeat(layers)
+    x = self.random_sequence(2, 5, 4)
+    self.verify_contract(layer, x)
+
+  def test_num_repeats(self):
+    layers = [simple.Identity.Config().make() for _ in range(5)]
+    layer = combinators.Repeat(layers)
+    self.assertEqual(layer.num_repeats, 5)
+
+  def test_from_config(self):
+    import sequence_layers.jax as sl
+
+    config = sl.Repeat.Config(
+        layer=sl.Identity.Config(),  # pyrefly: ignore[bad-argument-type]
+        num_repeats=4,
+    )
+    mlx_layer = combinators.Repeat.from_config(config)
+    self.assertIsInstance(mlx_layer, combinators.Repeat)
+    self.assertEqual(mlx_layer.num_repeats, 4)
+
+
+class ParallelTest(test_utils.SequenceLayerTest):
+
+  def test_stack(self):
+    layer = combinators.Parallel(
+        [simple.Identity.Config().make(), simple.Identity.Config().make()],
+        combination=combinators.CombinationMode.STACK,
+    )
+    x = self.random_sequence(1, 4, 3)
+    y = layer.layer(x, training=False)
+    # STACK: (3,) + (3,) -> (2, 3)
+    self.assertEqual(y.channel_shape, (2, 3))
+
+  def test_concat(self):
+    layer = combinators.Parallel(
+        [
+            dense.Dense.Config(features=3).make(),
+            dense.Dense.Config(features=5).make(),
+        ],
+        combination=combinators.CombinationMode.CONCAT,
+    )
+    x = self.random_sequence(1, 4, 4)
+    y = layer.layer(x, training=False)
+    self.assertEqual(y.channel_shape, (8,))
+
+  def test_add(self):
+    layer = combinators.Parallel(
+        [simple.Identity.Config().make(), simple.Identity.Config().make()],
+        combination=combinators.CombinationMode.ADD,
+    )
+    x = self.random_sequence(1, 4, 3)
+    y = layer.layer(x, training=False)
+    self.assertEqual(y.channel_shape, (3,))
+    # ADD of two identities = 2x
+    np.testing.assert_allclose(y.values, x.values * 2, atol=1e-6)
+
+  def test_mean(self):
+    layer = combinators.Parallel(
+        [simple.Identity.Config().make(), simple.Identity.Config().make()],
+        combination=combinators.CombinationMode.MEAN,
+    )
+    x = self.random_sequence(1, 4, 3)
+    y = layer.layer(x, training=False)
+    # MEAN of two identities = x
+    np.testing.assert_allclose(y.values, x.values, atol=1e-6)
+
+  def test_product(self):
+    layer = combinators.Parallel(
+        [simple.Identity.Config().make(), simple.Identity.Config().make()],
+        combination=combinators.CombinationMode.PRODUCT,
+    )
+    x = self.random_sequence(1, 4, 3)
+    y = layer.layer(x, training=False)
+    # PRODUCT of two identities = x^2
+    np.testing.assert_allclose(y.values, x.values * x.values, atol=1e-6)
+
+  def test_step_consistency(self):
+    layer = combinators.Parallel(
+        [simple.Identity.Config().make(), simple.Identity.Config().make()],
+        combination=combinators.CombinationMode.ADD,
+    )
+    x = self.random_sequence(2, 5, 4)
+    self.verify_contract(layer, x)
+
+  def test_output_shape_stack(self):
+    layer = combinators.Parallel(
+        [simple.Identity.Config().make(), simple.Identity.Config().make()],
+        combination=combinators.CombinationMode.STACK,
+    )
+    self.assertEqual(layer.get_output_shape((4,)), (2, 4))
+
+  def test_output_shape_concat(self):
+    layer = combinators.Parallel(
+        [
+            dense.Dense.Config(features=3).make(),
+            dense.Dense.Config(features=5).make(),
+        ],
+        combination=combinators.CombinationMode.CONCAT,
+    )
+    self.assertEqual(layer.get_output_shape((4,)), (8,))
+
+  def test_from_config(self):
+    from sequence_layers.jax import utils as jax_utils
+    import sequence_layers.jax as sl
+
+    config = sl.Parallel.Config(
+        layers=[  # pyrefly: ignore[bad-argument-type]
+            sl.Identity.Config(),
+            sl.Identity.Config(),
+        ],
+        combination=jax_utils.CombinationMode.ADD,
+    )
+    mlx_layer = combinators.Parallel.from_config(config)
+    self.assertIsInstance(mlx_layer, combinators.Parallel)
+    x = self.random_sequence(2, 5, 4)
+    self.verify_contract(mlx_layer, x)
+
+  def test_unequal_ratio_raises(self):
+    from sequence_layers.mlx import convolution
+
+    with self.assertRaises(ValueError):
+      combinators.Parallel([
+          simple.Identity.Config().make(),
+          convolution.Conv1D(
+              in_features=4,
+              filters=4,
+              kernel_size=3,
+              strides=2,
+              padding='causal',
+          ),
+      ])
+
+
+class TransformerEndToEndTest(test_utils.SequenceLayerTest):
+  """End-to-end test with a full Transformer config."""
+
+  def test_decoder_transformer(self):
+    import jax
+
+    import sequence_layers.jax as sl
+    from sequence_layers.jax.attention import dot_product_self_attention as dpa
+
+    # Attention outputs [b, t, num_heads, units_per_head].
+    # A Dense layer after it projects back to model dim.
+    config = sl.Serial.Config([
+        sl.Residual.Config([
+            sl.RMSNormalization.Config(),
+            dpa.DotProductSelfAttention.Config(
+                num_heads=4,
+                units_per_head=8,
+                max_past_horizon=64,
+            ),
+            sl.Flatten.Config(),
+            sl.Dense.Config(features=32),
+        ]),
+        sl.Residual.Config([
+            sl.RMSNormalization.Config(),
+            sl.Dense.Config(features=64, activation=jax.nn.gelu),
+            sl.Dense.Config(features=32),
+        ]),
+    ])
+    model = combinators.Serial.from_config(config)
+
+    # Layer mode.
+    x = self.random_sequence(1, 10, 32)
+    y = model.layer(x, training=False)
+    self.assertEqual(y.shape, (1, 10, 32))
+
+    # Step mode.
+    spec = bt.ShapeDType((32,), mx.float32)
+    state = model.get_initial_state(1, spec)
+    x_step = self.random_sequence(1, 1, 32)
+    for _ in range(5):
+      y_step, state = model.step(x_step, state, training=False)
+    self.assertEqual(y_step.shape, (1, 1, 32))
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/conditioning.py b/sequence_layers/mlx/conditioning.py
new file mode 100644
index 0000000..94d1101
--- /dev/null
+++ b/sequence_layers/mlx/conditioning.py
@@ -0,0 +1,456 @@
+"""Conditioning layers for MLX."""
+
+# pylint: disable=protected-access
+
+import dataclasses
+from typing import Any, cast, override
+
+import mlx.core as mx
+
+from sequence_layers.mlx import types
+from sequence_layers.mlx.init_mapping import _to_mx_dtype
+from sequence_layers.specs import conditioning as conditioning_spec
+
+Sequence = types.Sequence
+MaskedSequence = types.MaskedSequence
+
+
+# ---------------------------------------------------------------------------
+# Broadcast helpers
+# ---------------------------------------------------------------------------
+
+
+def _broadcast_shapes(shape1, shape2):
+  """Compute the broadcast shape of two shapes (numpy-style)."""
+  s1 = list(shape1)
+  s2 = list(shape2)
+  while len(s1) < len(s2):
+    s1.insert(0, 1)
+  while len(s2) < len(s1):
+    s2.insert(0, 1)
+  result = []
+  for a, b in zip(s1, s2):
+    if a == 1:
+      result.append(b)
+    elif b == 1:
+      result.append(a)
+    elif a == b:
+      result.append(a)
+    else:
+      raise ValueError(f'Shapes {shape1} and {shape2} are not broadcastable')
+  return tuple(result)
+
+
+def _reshape_for_broadcast(*seqs):
+  """Reshape channel dims of many sequences to be broadcastable."""
+  max_dims = max(x.ndim for x in seqs)
+
+  def _maybe_reshape(values):
+    extra_dims = max_dims - values.ndim
+    if extra_dims == 0:
+      return values
+    batch_size, time = values.shape[:2]
+    shape = (batch_size, time) + (1,) * extra_dims + values.shape[2:]
+    return mx.reshape(values, shape)
+
+  return tuple(x.apply_values(_maybe_reshape) for x in seqs)
+
+
+def _combine_mask(*masks):
+  """AND together multiple masks."""
+  result = masks[0]
+  for m in masks[1:]:
+    if m is not result:
+      result = mx.logical_and(result, m)
+  return result
+
+
+def _sequence_broadcast_add(x, y):
+  """Broadcast-add two sequences."""
+  x, y = _reshape_for_broadcast(x, y)
+  return Sequence(x.values + y.values, _combine_mask(x.mask, y.mask))
+
+
+def _sequence_broadcast_product(x, y):
+  """Broadcast-multiply two sequences."""
+  x, y = _reshape_for_broadcast(x, y)
+  return Sequence(x.values * y.values, _combine_mask(x.mask, y.mask))
+
+
+def _sequence_broadcast_concat(x, y):
+  """Broadcast-concat on last axis."""
+  x, y = _reshape_for_broadcast(x, y)
+  x_shape = x.values.shape
+  y_shape = y.values.shape
+  # Broadcast all dims except the last.
+  target_outer = []
+  for i in range(len(x_shape) - 1):
+    target_outer.append(max(x_shape[i], y_shape[i]))
+  x_vals = mx.broadcast_to(x.values, tuple(target_outer) + (x_shape[-1],))
+  y_vals = mx.broadcast_to(y.values, tuple(target_outer) + (y_shape[-1],))
+  return Sequence(
+      mx.concatenate([x_vals, y_vals], axis=-1),
+      _combine_mask(x.mask, y.mask),
+  )
+
+
+def _sequence_unstack(seq, axis):
+  """Unstack a sequence along a channel axis."""
+  if axis < 0:
+    axis += seq.ndim
+  if axis <= 1 or axis >= seq.ndim:
+    raise ValueError(f'Invalid axis: {axis=} {seq.ndim=}')
+  n = seq.values.shape[axis]
+  splits = []
+  for i in range(n):
+    v = mx.take(seq.values, mx.array([i]), axis=axis)
+    v = mx.squeeze(v, axis=axis)
+    splits.append(v)
+  return [type(seq)(v, seq.mask) for v in splits]
+
+
+# ---------------------------------------------------------------------------
+# Conditioning helpers
+# ---------------------------------------------------------------------------
+
+
+def _get_conditioning(layer, conditioning_name, constants):
+  """Gets the conditioning from constants."""
+  if constants is None:
+    raise ValueError(
+        f'{layer} requires conditioning via constants, got: {constants}'
+    )
+  conditioning = constants.get(conditioning_name)
+  if conditioning is None:
+    raise ValueError(
+        f'{layer} expected {conditioning_name!r} in constants,'
+        f' got keys: {list(constants.keys())}'
+    )
+  return conditioning
+
+
+def _tensor_to_fake_sequence(t):
+  """Wrap a [B, ...] tensor as a [B, 1, ...] MaskedSequence."""
+  batch_size = t.shape[0]
+  return MaskedSequence(
+      mx.expand_dims(t, axis=1),
+      mx.ones((batch_size, 1), dtype=mx.bool_),
+  )
+
+
+# ---------------------------------------------------------------------------
+# Conditioning layer
+# ---------------------------------------------------------------------------
+
+
+class Conditioning(
+    types.SequenceLayer,
+    conditioning_spec.Conditioning[types.Sequence, types.ChannelSpec],
+):
+  """Conditions x on a conditioning signal from constants.
+
+  Conditioning is done time-synchronized: each timestep of x is conditioned
+  on the corresponding timestep of c.
+
+  Conditioning = Combine(x, Project(c)).
+  """
+
+  # Aliases for backward compatibility
+  Projection = conditioning_spec.Projection
+  Combination = conditioning_spec.Combination
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(
+      types.SequenceLayerConfig, conditioning_spec.Conditioning.Config
+  ):
+    """Configuration for Conditioning."""
+
+    conditioning_name: str
+    projection: conditioning_spec.Projection
+    combination: conditioning_spec.Combination
+    projection_channel_shape: types.Shape | None = None
+    streaming: bool = False
+    affine_scale_offset: complex = 1.0
+    compute_dtype: Any = None
+    param_dtype: types.DType = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Conditioning':
+      return Conditioning(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      conditioning_name: str | None = None,
+      projection: conditioning_spec.Projection | None = None,
+      combination: conditioning_spec.Combination | None = None,
+      projection_channel_shape=None,
+      streaming: bool = False,
+      affine_scale_offset=1.0,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if conditioning_name is None or projection is None or combination is None:
+        raise ValueError(
+            'Must provide either config or conditioning_name, projection, and'
+            ' combination'
+        )
+      self.config = self.Config(
+          conditioning_name=conditioning_name,
+          projection=projection,
+          combination=combination,
+          projection_channel_shape=projection_channel_shape,
+          streaming=streaming,
+          affine_scale_offset=affine_scale_offset,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+      )
+
+    self._conditioning_name = self.config.conditioning_name
+    self._projection = self.config.projection
+    self._combination = self.config.combination
+    self._projection_channel_shape = self.config.projection_channel_shape
+    self._streaming = self.config.streaming
+    self._affine_scale_offset = self.config.affine_scale_offset
+    self._compute_dtype = (
+        _to_mx_dtype(self.config.compute_dtype)
+        if self.config.compute_dtype is not None
+        else None
+    )
+    self._param_dtype = _to_mx_dtype(self.config.param_dtype)
+
+    # Projection kernel/bias (deferred until first use).
+    self.kernel = None
+    self.bias = None
+    self._equation = None
+    self._proj_initialized = False
+
+  def _validate(self):
+    """Validates the configuration parameters for consistency."""
+    if (
+        self._combination == self.Combination.AFFINE
+        and self._projection != self.Projection.LINEAR_AFFINE
+    ):
+      raise ValueError('AFFINE combination requires LINEAR_AFFINE projection.')
+    if (
+        self._combination == self.Combination.AFFINE_SHIFT
+        and self._projection != self.Projection.LINEAR
+    ):
+      raise ValueError('AFFINE_SHIFT combination requires LINEAR projection.')
+    if (
+        self._combination == self.Combination.AFFINE_SCALE
+        and self._projection != self.Projection.LINEAR
+    ):
+      raise ValueError('AFFINE_SCALE combination requires LINEAR projection.')
+    if (
+        self._combination != self.Combination.AFFINE
+        and self._projection == self.Projection.LINEAR_AFFINE
+    ):
+      raise ValueError('LINEAR_AFFINE projection requires AFFINE combination.')
+
+  def _ensure_projection_initialized(self, x_channel_shape, cond_channel_shape):
+    """Initialize projection kernel/bias on first use."""
+    if self._proj_initialized:
+      return
+    if self._projection == self.Projection.IDENTITY:
+      self._proj_initialized = True
+      return
+
+    proj_shape = self._projection_channel_shape
+    if proj_shape is None:
+      proj_shape = x_channel_shape
+
+    if self._projection == self.Projection.LINEAR_AFFINE:
+      output_shape = (2,) + tuple(proj_shape)
+    else:
+      output_shape = tuple(proj_shape)
+
+    # Build einsum equation matching Linen DenseShaped.
+    input_dims = ''.join(
+        chr(ord('a') + i) for i in range(len(cond_channel_shape))
+    )
+    output_dims = ''.join(
+        chr(ord('a') + i + len(cond_channel_shape))
+        for i in range(len(output_shape))
+    )
+
+    input_weight_dims = input_dims if input_dims else 'I'
+    output_weight_dims = output_dims if output_dims else 'O'
+    input_kernel_shape = cond_channel_shape if cond_channel_shape else (1,)
+    output_kernel_shape = output_shape if output_shape else (1,)
+
+    self._equation = (
+        f'...{input_dims},{input_weight_dims}{output_weight_dims}'
+        f'->...{output_dims}'
+    )
+    kernel_shape = input_kernel_shape + output_kernel_shape
+    self.kernel = mx.zeros(kernel_shape, dtype=self._param_dtype)
+    self.bias = mx.zeros(output_kernel_shape, dtype=self._param_dtype)
+    self._proj_initialized = True
+
+  def _projected_condition_shape(self, input_shape, condition_shape):
+    """Compute the channel shape after projection."""
+    proj_shape = self._projection_channel_shape
+    if proj_shape is None:
+      proj_shape = input_shape
+    if self._projection == self.Projection.IDENTITY:
+      return condition_shape
+    if self._projection == self.Projection.LINEAR:
+      return tuple(proj_shape)
+    if self._projection == self.Projection.LINEAR_AFFINE:
+      return (2,) + tuple(proj_shape)
+    raise ValueError(f'Unsupported projection: {self._projection}')
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    self._validate()
+    cond = _get_conditioning(self, self._conditioning_name, constants)
+    if isinstance(cond, (Sequence, MaskedSequence)):
+      cond_shape = cond.channel_shape
+    else:
+      cond_shape = cond.shape[1:]
+    proj_shape = self._projected_condition_shape(input_shape, cond_shape)
+
+    if self._combination in (
+        self.Combination.ADD,
+        self.Combination.MUL,
+        self.Combination.AFFINE_SHIFT,
+        self.Combination.AFFINE_SCALE,
+    ):
+      return _broadcast_shapes(input_shape, proj_shape)
+    if self._combination in (
+        self.Combination.CONCAT,
+        self.Combination.CONCAT_BEFORE,
+    ):
+      input_inner = input_shape[-1] if input_shape else 1
+      proj_inner = proj_shape[-1] if proj_shape else 1
+      outer = _broadcast_shapes(input_shape[:-1], proj_shape[:-1])
+      return outer + (input_inner + proj_inner,)
+    if self._combination == self.Combination.AFFINE:
+      proj_shape = proj_shape[1:]  # Remove the '2' dim.
+      return _broadcast_shapes(input_shape, proj_shape)
+    raise ValueError(f'Unsupported combination: {self._combination}')
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    if self._compute_dtype is not None:
+      return self._compute_dtype
+    return self._param_dtype
+
+  def _project(self, x, conditioning):
+    """Apply projection to conditioning."""
+    self._validate()
+    if self._projection == self.Projection.IDENTITY:
+      return conditioning
+
+    self._ensure_projection_initialized(
+        x.channel_shape, conditioning.channel_shape
+    )
+    compute_dtype = self._compute_dtype or self._param_dtype
+
+    def project_fn(v):
+      y = mx.einsum(
+          cast(str, self._equation), v.astype(compute_dtype), self.kernel
+      )
+      y = y + cast(Any, self.bias)
+      return y
+
+    return conditioning.apply_values(project_fn)
+
+  def _combine(self, x, conditioning):
+    """Combine projected conditioning with input."""
+    self._validate()
+    if self._combination == self.Combination.ADD:
+      return _sequence_broadcast_add(x, conditioning)
+    if self._combination == self.Combination.CONCAT:
+      return _sequence_broadcast_concat(x, conditioning)
+    if self._combination == self.Combination.CONCAT_BEFORE:
+      return _sequence_broadcast_concat(conditioning, x)
+    if self._combination == self.Combination.AFFINE:
+      scale, shift = _sequence_unstack(conditioning, axis=2)
+      scale = scale.apply_values(lambda v: v + self._affine_scale_offset)
+      x_s, scale_s = _reshape_for_broadcast(x, scale)
+      _, shift_s = _reshape_for_broadcast(x, shift)
+      values = x_s.values * scale_s.values + shift_s.values
+      mask = _combine_mask(x.mask, scale.mask, shift.mask)
+      return Sequence(values, mask)
+    if self._combination == self.Combination.AFFINE_SHIFT:
+      return _sequence_broadcast_add(x, conditioning)
+    if self._combination == self.Combination.AFFINE_SCALE:
+      conditioning = conditioning.apply_values(
+          lambda v: v + self._affine_scale_offset
+      )
+      return _sequence_broadcast_product(x, conditioning)
+    if self._combination == self.Combination.MUL:
+      return _sequence_broadcast_product(x, conditioning)
+    raise ValueError(f'Unsupported combination: {self._combination}')
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    conditioning = _get_conditioning(self, self._conditioning_name, constants)
+    if not isinstance(conditioning, (Sequence, MaskedSequence)):
+      conditioning = _tensor_to_fake_sequence(conditioning)
+    projected = self._project(x, conditioning)
+    return self._combine(x, projected)
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    if not self._streaming:
+      conditioning = _get_conditioning(self, self._conditioning_name, constants)
+      if isinstance(conditioning, (Sequence, MaskedSequence)):
+        return mx.zeros((batch_size,), mx.int32)
+    return ()
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state: Any, *, training: bool, constants=None
+  ):
+    conditioning = _get_conditioning(self, self._conditioning_name, constants)
+    if not isinstance(conditioning, (Sequence, MaskedSequence)):
+      conditioning = _tensor_to_fake_sequence(conditioning)
+    elif not self._streaming:
+      time_index = state
+      step_size = x.shape[1]
+      idx = int(time_index[0])
+      conditioning = type(conditioning)(
+          conditioning.values[:, idx : idx + step_size],
+          conditioning.mask[:, idx : idx + step_size],
+      )
+      state = time_index + step_size
+    projected = self._project(x, conditioning)
+    result = self._combine(x, projected)
+    return result, state
+
+  @classmethod
+  def from_config(cls, config):
+    """Create from a JAX Conditioning.Config."""
+    compute_dtype = getattr(config, 'compute_dtype', None)
+    if compute_dtype is not None:
+      compute_dtype = _to_mx_dtype(compute_dtype)
+    # Map JAX enum values to MLX enum values.
+    projection = cls.Projection(config.projection.value)
+    combination = cls.Combination(config.combination.value)
+    mlx_config = cls.Config(
+        conditioning_name=config.conditioning_name,
+        projection=projection,
+        combination=combination,
+        projection_channel_shape=config.projection_channel_shape,
+        streaming=config.streaming,
+        affine_scale_offset=config.affine_scale_offset,
+        compute_dtype=compute_dtype,
+        param_dtype=_to_mx_dtype(config.param_dtype),
+        name=config.name,
+    )
+    return cls(mlx_config)
diff --git a/sequence_layers/mlx/conditioning_test.py b/sequence_layers/mlx/conditioning_test.py
new file mode 100644
index 0000000..951526e
--- /dev/null
+++ b/sequence_layers/mlx/conditioning_test.py
@@ -0,0 +1,33 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for Conditioning MLX sequence layer."""
+
+from absl.testing import absltest
+from absl.testing import parameterized
+
+from sequence_layers.mlx import conditioning
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import conditioning_behaviors
+
+
+class ConditioningTest(
+    conditioning_behaviors.ConditioningTest,
+    test_utils.SequenceLayerTest,
+    parameterized.TestCase,
+):
+  pass
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/convolution.py b/sequence_layers/mlx/convolution.py
new file mode 100644
index 0000000..b7c96dd
--- /dev/null
+++ b/sequence_layers/mlx/convolution.py
@@ -0,0 +1,1150 @@
+"""Convolution layers for MLX."""
+
+# pylint: disable=protected-access,abstract-method
+
+import dataclasses
+import fractions
+from typing import Any, Callable, cast, override
+
+from mlx import nn
+import mlx.core as mx
+
+from sequence_layers.mlx import init_mapping
+from sequence_layers.mlx import types
+from sequence_layers.mlx.types import \
+    SequenceLayerConfig as _SequenceLayerConfig
+from sequence_layers.specs import convolution as spec
+
+from . import types as bt
+
+Sequence = bt.Sequence
+MaskedSequence = bt.MaskedSequence
+PaddingMode = bt.PaddingMode
+
+
+# ---------------------------------------------------------------------------
+# Padding utilities (ported from jax/utils.py and jax/convolution.py)
+# ---------------------------------------------------------------------------
+
+
+def _effective_kernel_size(kernel_size, dilation_rate):
+  """Returns the effective kernel size after dilation."""
+  return (kernel_size - 1) * dilation_rate + 1
+
+
+def _explicit_padding(padding, kernel_size, stride, dilation_rate):
+  """Returns (pad_left, pad_right) for the given padding mode."""
+  if not isinstance(padding, str):
+    return tuple(padding)
+
+  ek = _effective_kernel_size(kernel_size, dilation_rate)
+
+  if padding in (PaddingMode.CAUSAL_VALID.value, PaddingMode.CAUSAL.value):
+    return (ek - 1, 0)
+  if padding == PaddingMode.SEMICAUSAL.value:
+    pad_left = max(ek - stride, 0)
+    return (pad_left, ek - 1 - pad_left)
+  if padding in (
+      PaddingMode.REVERSE_CAUSAL_VALID.value,
+      PaddingMode.REVERSE_CAUSAL.value,
+  ):
+    return (0, ek - 1)
+  if padding == PaddingMode.SAME.value:
+    pad_amount = ek - 1
+    pad_left = pad_amount // 2
+    return (pad_left, pad_amount - pad_left)
+  if padding == PaddingMode.VALID.value:
+    return (0, 0)
+  if padding == PaddingMode.SEMICAUSAL_FULL.value:
+    return (ek - stride, ek - 1)
+  raise ValueError(f'Unsupported padding mode: {padding}')
+
+
+def _compute_output_length(l_in, kernel_size, stride, dilation_rate, padding):
+  """Computes the expected output sequence length."""
+  pad_left, pad_right = _explicit_padding(
+      padding, kernel_size, stride, dilation_rate
+  )
+  l_pad = l_in + pad_left + pad_right
+  k_eff = _effective_kernel_size(kernel_size, dilation_rate)
+  l_out = (l_pad - k_eff) // stride + 1
+  return max(l_out, 0)
+
+
+def _buffer_width(padding, kernel_size, stride, dilation_rate):
+  """Returns the buffer width for step mode."""
+  ek = _effective_kernel_size(kernel_size, dilation_rate)
+
+  if padding == PaddingMode.SEMICAUSAL.value:
+    return max(ek - stride, 0)
+  if padding in (
+      PaddingMode.REVERSE_CAUSAL.value,
+      PaddingMode.REVERSE_CAUSAL_VALID.value,
+  ):
+    return (ek - 1) // stride * stride
+  if padding in (
+      PaddingMode.CAUSAL.value,
+      PaddingMode.CAUSAL_VALID.value,
+  ):
+    return ek - 1
+  raise ValueError(f'Unsupported step padding: {padding}')
+
+
+def _supports_step(padding):
+  """Returns True if the padding mode supports step-by-step processing."""
+  return padding in (
+      PaddingMode.CAUSAL_VALID.value,
+      PaddingMode.REVERSE_CAUSAL_VALID.value,
+      PaddingMode.CAUSAL.value,
+      PaddingMode.REVERSE_CAUSAL.value,
+      PaddingMode.SEMICAUSAL.value,
+  )
+
+
+def _compute_conv_mask(
+    mask, kernel_size, stride, dilation_rate, padding, is_step
+):
+  """Computes the mask for convolution layers."""
+  if not is_step:
+    l_out = _compute_output_length(
+        mask.shape[1], kernel_size, stride, dilation_rate, padding
+    )
+    if l_out == 0:
+      return mx.zeros((mask.shape[0], 0), dtype=mx.bool_)
+
+  if is_step:
+    if isinstance(padding, str) and padding in (
+        PaddingMode.SAME.value,
+        PaddingMode.CAUSAL.value,
+        PaddingMode.REVERSE_CAUSAL.value,
+        PaddingMode.SEMICAUSAL.value,
+    ):
+      pad_left, pad_right = _explicit_padding(
+          padding, kernel_size, stride, dilation_rate
+      )
+      # Use a simple convolution-like mask computation with float kernel.
+      kernel = [0.0] * pad_left + [1.0] + [0.0] * pad_right
+      kernel = mx.array(kernel, dtype=mx.float32).reshape(1, -1, 1)
+      mask_f = mask[:, :, None].astype(mx.float32)
+      mask_conv = mx.conv1d(mask_f, kernel, stride=stride)
+      return mx.squeeze(mask_conv, axis=-1).astype(mx.bool_)
+    if not isinstance(padding, str) or padding in (
+        PaddingMode.VALID.value,
+        PaddingMode.CAUSAL_VALID.value,
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+    ):
+      return _compute_conv_mask_logical(
+          mask, kernel_size, stride, dilation_rate
+      )
+    return _compute_conv_mask_logical(mask, kernel_size, stride, dilation_rate)
+
+  # Layer mode.
+  if isinstance(padding, str) and padding in (
+      PaddingMode.SAME.value,
+      PaddingMode.CAUSAL.value,
+      PaddingMode.REVERSE_CAUSAL.value,
+      PaddingMode.SEMICAUSAL.value,
+  ):
+    if stride > 1:
+      mask = mask[:, ::stride]
+    return mask
+
+  # VALID-like modes: need to compute mask through reduce_window equiv.
+  pad_left, pad_right = _explicit_padding(
+      padding, kernel_size, stride, dilation_rate
+  )
+  is_causal_valid = (
+      isinstance(padding, str) and padding == PaddingMode.CAUSAL_VALID.value
+  )
+  mask = mx.pad(
+      mask,
+      [(0, 0), (pad_left, pad_right)],
+      constant_values=is_causal_valid,
+  )
+  is_semicausal_full = (
+      isinstance(padding, str) and padding == PaddingMode.SEMICAUSAL_FULL.value
+  )
+  return _compute_conv_mask_logical(
+      mask,
+      kernel_size,
+      stride,
+      dilation_rate,
+      use_logical_or=is_semicausal_full,
+  )
+
+
+def _compute_conv_mask_logical(
+    mask, kernel_size, stride, dilation_rate, use_logical_or=False
+):
+  """Windowed AND/OR mask computation."""
+  # Optimized path for dilation=1 and kernel_size divisible by stride.
+  if dilation_rate == 1 and kernel_size % stride == 0:
+    num_frames = mask.shape[1] // stride
+    mask = mask[:, : num_frames * stride]
+    mask = mask.reshape(mask.shape[0], num_frames, stride)
+    if use_logical_or:
+      mask = mx.max(mask, axis=-1)
+    else:
+      mask = mx.min(mask, axis=-1)
+    kernel_size = kernel_size // stride
+    stride = 1
+
+  if kernel_size == 1 and stride == 1:
+    return mask
+
+  # Use float conv to simulate reduce_window.
+  mask_f = mask[:, :, None].astype(mx.float32)
+  # Build a kernel with ones at dilated positions.
+  if dilation_rate == 1:
+    kernel = mx.ones((1, kernel_size, 1), dtype=mx.float32)
+  else:
+    ek = _effective_kernel_size(kernel_size, dilation_rate)
+    k = [0.0] * ek
+    for i in range(kernel_size):
+      k[i * dilation_rate] = 1.0
+    kernel = mx.array(k, dtype=mx.float32).reshape(1, -1, 1)
+
+  result = mx.conv1d(mask_f, kernel, stride=stride)
+  result = mx.squeeze(result, axis=-1)
+
+  if use_logical_or:
+    return result > 0.0
+  return result >= float(kernel_size)
+
+
+def _compute_initial_state(batch_size, input_spec, buf_width, padding):
+  """Create initial buffer state for step mode."""
+  if padding in (
+      PaddingMode.CAUSAL_VALID.value,
+      PaddingMode.REVERSE_CAUSAL_VALID.value,
+      PaddingMode.SEMICAUSAL_FULL.value,
+  ):
+    mask = mx.ones((batch_size, buf_width), dtype=bt.MASK_DTYPE)
+  elif padding in (
+      PaddingMode.CAUSAL.value,
+      PaddingMode.REVERSE_CAUSAL.value,
+      PaddingMode.SEMICAUSAL.value,
+  ):
+    mask = mx.zeros((batch_size, buf_width), dtype=bt.MASK_DTYPE)
+  else:
+    raise ValueError(f'Step not supported with padding: {padding}')
+
+  values = mx.zeros(
+      (batch_size, buf_width) + input_spec.shape,
+      dtype=input_spec.dtype,
+  )
+  return MaskedSequence(values, mask)
+
+
+# ---------------------------------------------------------------------------
+# Conv1D
+# ---------------------------------------------------------------------------
+
+
+class Conv1D(types.SequenceLayer, spec.Conv1D[bt.Sequence, bt.ChannelSpec]):
+  """1D strided or dilated convolution layer.
+
+  Supports causal, reverse_causal, same, and valid padding modes.
+  Step-by-step processing is supported for causal padding modes.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Conv1D.Config):
+    """Configuration for Conv1D."""
+
+    filters: int
+    kernel_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: bt.PaddingModeString = PaddingMode.VALID.value
+    groups: int = 1
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Conv1D':
+      return Conv1D(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      in_features: int | None = None,
+      filters: int | None = None,
+      kernel_size: int | None = None,
+      strides: int = 1,
+      dilation_rate: int = 1,
+      padding: Any = 'valid',
+      groups: int = 1,
+      use_bias: bool = True,
+      activation=None,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if filters is None or kernel_size is None:
+        raise ValueError(
+            'Must provide either config or filters and kernel_size'
+        )
+      self.config = self.Config(
+          filters=filters,
+          kernel_size=kernel_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          padding=padding,
+          groups=groups,
+          use_bias=use_bias,
+          activation=activation,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+      )
+
+    self.in_features = in_features
+    self.filters = self.config.filters
+    self.kernel_size = self.config.kernel_size
+    self.strides = self.config.strides
+    self.dilation_rate = self.config.dilation_rate
+    self.padding = self.config.padding
+    self.groups = self.config.groups
+    self.use_bias = self.config.use_bias
+    self.activation = init_mapping.map_activation(self.config.activation)
+    self.compute_dtype = (
+        init_mapping._to_mx_dtype(self.config.compute_dtype)
+        if self.config.compute_dtype is not None
+        else None
+    )
+    self._param_dtype = init_mapping._to_mx_dtype(self.config.param_dtype)
+
+    self._conv: Any = None
+    if in_features is not None:
+      self._ensure_initialized(in_features)
+
+  def _ensure_initialized(self, in_features: int):
+    """Initializes the convolution layer weight and bias."""
+    if self._conv is not None:
+      return
+    self.in_features = in_features
+    if in_features % self.groups != 0:
+      raise ValueError(f'{in_features=} must be divisible by {self.groups=}.')
+
+    self._conv = nn.Conv1d(
+        in_channels=in_features,
+        out_channels=self.filters,
+        kernel_size=self.kernel_size,
+        stride=self.strides,
+        # Padding handled manually.
+        padding=0,
+        dilation=self.dilation_rate,
+        bias=self.use_bias,
+    )
+
+  @property
+  @override
+  def supports_step(self):
+    return _supports_step(self.padding)
+
+  @property
+  @override
+  def block_size(self):
+    return self.strides
+
+  @property
+  @override
+  def output_ratio(self):
+    return fractions.Fraction(1, self.strides)
+
+  @property
+  @override
+  def input_latency(self):
+    ek = _effective_kernel_size(self.kernel_size, self.dilation_rate)
+    if self.padding in (
+        PaddingMode.CAUSAL_VALID.value,
+        PaddingMode.CAUSAL.value,
+        PaddingMode.SEMICAUSAL.value,
+    ):
+      return 0
+    if self.padding in (
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+        PaddingMode.REVERSE_CAUSAL.value,
+        PaddingMode.SEMICAUSAL_FULL.value,
+    ):
+      return ek - 1
+    return 0
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if len(input_shape) != 1:
+      raise ValueError(
+          f'Conv1D requires rank 3 input, got channel_shape={input_shape}.'
+      )
+    return (self.filters,)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.compute_dtype or self._param_dtype
+
+  def _forward(self, values, pad_left, pad_right):
+    """Apply convolution with explicit padding."""
+    if pad_left > 0 or pad_right > 0:
+      values = mx.pad(
+          values,
+          [(0, 0), (pad_left, pad_right), (0, 0)],
+      )
+    compute_dtype = self.compute_dtype or self._param_dtype
+    values = values.astype(compute_dtype)
+    y = self._conv(values)
+    if self.activation is not None:
+      y = self.activation(y)
+    return y
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(input_spec.shape[-1])
+    bw = _buffer_width(
+        self.padding,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+    )
+    if not bw:
+      return ()
+    return _compute_initial_state(
+        batch_size,
+        input_spec,
+        bw,
+        self.padding,
+    )
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(x.shape[-1])
+    ek = _effective_kernel_size(self.kernel_size, self.dilation_rate)
+    if ek > 1:
+      x = x.mask_invalid()
+
+    bw = _buffer_width(
+        self.padding,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+    )
+
+    if bw:
+      state = state.concatenate(x)  # pyrefly: ignore[missing-attribute]
+    else:
+      state = x
+
+    # In step mode, padding is provided by the buffer — use valid conv.
+    values = self._forward(state.values, 0, 0)
+    mask = _compute_conv_mask(
+        state.mask,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+        self.padding,
+        is_step=True,
+    )
+
+    if bw:
+      state = state[:, -bw:]
+    else:
+      state = ()
+
+    return Sequence(values, mask), state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+
+    self._ensure_initialized(x.shape[-1])
+    l_out = _compute_output_length(
+        x.shape[1],
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+        self.padding,
+    )
+    if l_out == 0:
+      output_spec = self.get_output_spec(x.channel_spec, constants=constants)
+      empty_values = mx.zeros(
+          (x.shape[0], 0, *output_spec.shape), dtype=x.values.dtype
+      )
+      empty_mask = mx.zeros((x.shape[0], 0), dtype=mx.bool_)
+      return Sequence(empty_values, empty_mask)
+
+    if self.kernel_size > 1:
+      x = x.mask_invalid()
+
+    pad_left, pad_right = _explicit_padding(
+        self.padding,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+    )
+    values = self._forward(x.values, pad_left, pad_right)
+    mask = _compute_conv_mask(
+        x.mask,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+        self.padding,
+        is_step=False,
+    )
+    return Sequence(values, mask)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a Conv1D instance from its configuration object."""
+    return cls(config)
+
+
+# ---------------------------------------------------------------------------
+# DepthwiseConv1D
+# ---------------------------------------------------------------------------
+
+
+class DepthwiseConv1D(
+    types.SequenceLayer, spec.DepthwiseConv1D[bt.Sequence, bt.ChannelSpec]
+):
+  """1D depthwise convolution layer.
+
+  Each input channel is convolved independently. The output has
+  in_features * channel_multiplier channels.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.DepthwiseConv1D.Config):
+    """Configuration for DepthwiseConv1D."""
+
+    kernel_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: bt.PaddingModeString = PaddingMode.VALID.value
+    channel_multiplier: int = 1
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'DepthwiseConv1D':
+      return DepthwiseConv1D(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      in_features: int | None = None,
+      kernel_size: int | None = None,
+      channel_multiplier: int = 1,
+      strides: int = 1,
+      dilation_rate: int = 1,
+      padding: Any = 'valid',
+      use_bias: bool = True,
+      activation=None,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if kernel_size is None:
+        raise ValueError('Must provide either config or kernel_size')
+      self.config = self.Config(
+          kernel_size=kernel_size,
+          channel_multiplier=channel_multiplier,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          padding=padding,
+          use_bias=use_bias,
+          activation=activation,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+      )
+
+    self.in_features = in_features
+    self.kernel_size = self.config.kernel_size
+    self.channel_multiplier = self.config.channel_multiplier
+    self.strides = self.config.strides
+    self.dilation_rate = self.config.dilation_rate
+    self.padding = self.config.padding
+    self.use_bias = self.config.use_bias
+    self.activation = init_mapping.map_activation(self.config.activation)
+    self.compute_dtype = (
+        init_mapping._to_mx_dtype(self.config.compute_dtype)
+        if self.config.compute_dtype is not None
+        else None
+    )
+    self._param_dtype = init_mapping._to_mx_dtype(self.config.param_dtype)
+
+    self._conv: Any = None
+    if in_features is not None:
+      self._ensure_initialized(in_features)
+
+  def _ensure_initialized(self, in_features: int):
+    """Initializes the depthwise convolution weights."""
+    if self._conv is not None:
+      return
+    self.in_features = in_features
+    out_features = in_features * self.channel_multiplier
+    self._conv = nn.Conv1d(
+        in_channels=in_features,
+        out_channels=out_features,
+        kernel_size=self.kernel_size,
+        stride=self.strides,
+        padding=0,
+        dilation=self.dilation_rate,
+        groups=in_features,
+        bias=self.use_bias,
+    )
+
+  @property
+  @override
+  def supports_step(self):
+    return _supports_step(self.padding)
+
+  @property
+  @override
+  def block_size(self):
+    return self.strides
+
+  @property
+  @override
+  def output_ratio(self):
+    return fractions.Fraction(1, self.strides)
+
+  @property
+  @override
+  def input_latency(self):
+    ek = _effective_kernel_size(self.kernel_size, self.dilation_rate)
+    if self.padding in (
+        PaddingMode.CAUSAL_VALID.value,
+        PaddingMode.CAUSAL.value,
+        PaddingMode.SEMICAUSAL.value,
+    ):
+      return 0
+    if self.padding in (
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+        PaddingMode.REVERSE_CAUSAL.value,
+        PaddingMode.SEMICAUSAL_FULL.value,
+    ):
+      return ek - 1
+    return 0
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if len(input_shape) != 1:
+
+      raise ValueError(
+          'DepthwiseConv1D requires rank 3 input, got '
+          f'channel_shape={input_shape}.'
+      )
+    return (input_shape[0] * self.channel_multiplier,)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.compute_dtype or self._param_dtype
+
+  def _forward(self, values, pad_left, pad_right):
+    """Applies the depthwise convolution with explicit padding."""
+    if pad_left > 0 or pad_right > 0:
+      values = mx.pad(
+          values,
+          [(0, 0), (pad_left, pad_right), (0, 0)],
+      )
+    compute_dtype = self.compute_dtype or self._param_dtype
+    values = values.astype(compute_dtype)
+    y = self._conv(values)
+    if self.activation is not None:
+      y = self.activation(y)
+    return y
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(input_spec.shape[-1])
+    bw = _buffer_width(
+        self.padding,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+    )
+    if not bw:
+      return ()
+    return _compute_initial_state(
+        batch_size,
+        input_spec,
+        bw,
+        self.padding,
+    )
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(x.shape[-1])
+    ek = _effective_kernel_size(self.kernel_size, self.dilation_rate)
+    if ek > 1:
+      x = x.mask_invalid()
+
+    bw = _buffer_width(
+        self.padding,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+    )
+
+    if bw:
+      state = state.concatenate(x)  # pyrefly: ignore[missing-attribute]
+    else:
+      state = x
+
+    values = self._forward(state.values, 0, 0)
+    mask = _compute_conv_mask(
+        state.mask,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+        self.padding,
+        is_step=True,
+    )
+
+    if bw:
+      state = state[:, -bw:]
+    else:
+      state = ()
+
+    return Sequence(values, mask), state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+
+    self._ensure_initialized(x.shape[-1])
+    l_out = _compute_output_length(
+        x.shape[1],
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+        self.padding,
+    )
+    if l_out == 0:
+      output_spec = self.get_output_spec(x.channel_spec, constants=constants)
+      empty_values = mx.zeros(
+          (x.shape[0], 0, *output_spec.shape), dtype=x.values.dtype
+      )
+      empty_mask = mx.zeros((x.shape[0], 0), dtype=mx.bool_)
+      return Sequence(empty_values, empty_mask)
+
+    if self.kernel_size > 1:
+      x = x.mask_invalid()
+
+    pad_left, pad_right = _explicit_padding(
+        self.padding,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+    )
+    values = self._forward(x.values, pad_left, pad_right)
+    mask = _compute_conv_mask(
+        x.mask,
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+        self.padding,
+        is_step=False,
+    )
+    return Sequence(values, mask)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a DepthwiseConv1D instance from its configuration object."""
+    return cls(config)
+
+
+# ---------------------------------------------------------------------------
+# Conv1DTranspose
+# ---------------------------------------------------------------------------
+
+
+def _transpose_conv_output_trim(kernel_size, stride, dilation_rate, padding):
+  """Output-side trimming for transpose convolutions in MLX.
+
+  MLX conv_transpose1d with padding=0 produces output of size:
+    raw = (t - 1) * stride + ek
+  This function returns (trim_left, trim_right) to cut raw output
+  to the desired size.
+  """
+  ek = _effective_kernel_size(kernel_size, dilation_rate)
+  total_trim = max(0, ek - stride)
+
+  if padding == PaddingMode.CAUSAL.value:
+    return (0, total_trim)
+  if padding == PaddingMode.SAME.value:
+    trim_left = total_trim // 2
+    return (trim_left, total_trim - trim_left)
+  if padding == PaddingMode.VALID.value:
+    return (0, 0)
+  if padding == PaddingMode.SEMICAUSAL_FULL.value:
+    return (0, 0)
+  raise ValueError(f'Unsupported padding: {padding}')
+
+
+def _compute_conv_transpose_output_length(
+    time, kernel_size, stride, dilation_rate, padding
+):
+  """Computes the expected output length for transpose convolution."""
+  ek = _effective_kernel_size(kernel_size, dilation_rate)
+  if padding in (
+      PaddingMode.SAME.value,
+      PaddingMode.CAUSAL.value,
+      PaddingMode.SEMICAUSAL_FULL.value,
+  ):
+    return time * stride
+  if padding == PaddingMode.VALID.value:
+    return time * stride + max(ek - stride, 0)
+  raise ValueError(f'Unsupported padding: {padding}')
+
+
+def _compute_conv_transpose_mask(
+    mask, kernel_size, stride, dilation_rate, padding
+):
+  """Compute output mask for a transpose convolution."""
+  ek = _effective_kernel_size(kernel_size, dilation_rate)
+
+  if ek <= stride or padding in (
+      PaddingMode.SAME.value,
+      PaddingMode.CAUSAL.value,
+  ):
+    return mx.repeat(mask, stride, axis=1)
+
+  # Use transpose convolution to compute the mask.
+  tl, tr = _transpose_conv_output_trim(
+      kernel_size,
+      stride,
+      dilation_rate,
+      padding,
+  )
+
+  if padding == PaddingMode.SEMICAUSAL_FULL.value:
+    test_signal = mask
+
+    def test_fn(m):
+      return m > 0.0
+
+  else:
+    test_signal = mx.logical_not(mask)
+
+    def test_fn(m):
+      return m == 0.0
+
+  kernel = mx.ones((1, kernel_size, 1), dtype=mx.float32)
+  signal = test_signal.astype(mx.float32)[:, :, None]
+
+  result = mx.conv_transpose1d(
+      signal,
+      kernel,
+      stride=stride,
+      padding=0,
+      dilation=dilation_rate,
+  )
+  # Trim to match desired output.
+  if tl > 0:
+    result = result[:, tl:]
+  if tr > 0:
+    result = result[:, :-tr]
+  result = mx.squeeze(result, axis=-1)
+  return test_fn(result)
+
+
+class Conv1DTranspose(
+    types.SequenceLayer, spec.Conv1DTranspose[bt.Sequence, bt.ChannelSpec]
+):
+  """1D transpose (deconvolution) layer for upsampling.
+
+  Supports 'valid', 'causal', and 'same' padding modes.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Conv1DTranspose.Config):
+    """Configuration for Conv1DTranspose."""
+
+    filters: int
+    kernel_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: bt.PaddingModeString = PaddingMode.VALID.value
+    groups: int = 1
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Conv1DTranspose':
+      return Conv1DTranspose(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      in_features: int | None = None,
+      filters: int | None = None,
+      kernel_size: int | None = None,
+      strides: int = 1,
+      dilation_rate: int = 1,
+      padding: Any = 'valid',
+      groups: int = 1,
+      use_bias: bool = True,
+      activation=None,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if filters is None or kernel_size is None:
+        raise ValueError(
+            'Must provide either config or filters and kernel_size'
+        )
+      self.config = self.Config(
+          filters=filters,
+          kernel_size=kernel_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          padding=padding,
+          groups=groups,
+          use_bias=use_bias,
+          activation=activation,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+      )
+
+    self.in_features = in_features
+    self.filters = self.config.filters
+    self.kernel_size = self.config.kernel_size
+    self.strides = self.config.strides
+    self.dilation_rate = self.config.dilation_rate
+    self.padding = self.config.padding
+    self.groups = self.config.groups
+    self.use_bias = self.config.use_bias
+    self.activation = init_mapping.map_activation(self.config.activation)
+    self.compute_dtype = (
+        init_mapping._to_mx_dtype(self.config.compute_dtype)
+        if self.config.compute_dtype is not None
+        else None
+    )
+    self._param_dtype = init_mapping._to_mx_dtype(self.config.param_dtype)
+
+    self.kernel: Any = None
+    self.bias: Any = None
+    if in_features is not None:
+      self._ensure_initialized(in_features)
+
+  def _ensure_initialized(self, in_features: int):
+    """Initializes the transpose convolution weights and biases."""
+    if self.kernel is not None:
+      return
+    self.in_features = in_features
+    key = mx.random.key(0)
+    init = init_mapping._make_variance_scaling_init(
+        'fan_in', 'truncated_normal'
+    )
+    # Kernel: [out_channels, kernel_size, in_channels // groups]
+    self.kernel = init(
+        key,
+        (self.filters, self.kernel_size, in_features // self.groups),
+        self._param_dtype,
+    )
+    if self.use_bias:
+      self.bias = mx.zeros((self.filters,), dtype=self._param_dtype)
+
+  @property
+  @override
+  def supports_step(self):
+    return self.padding == PaddingMode.CAUSAL.value
+
+  @property
+  @override
+  def block_size(self):
+    return 1
+
+  @property
+  @override
+  def output_ratio(self):
+    return fractions.Fraction(self.strides)
+
+  @property
+  @override
+  def input_latency(self):
+    return 0
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if len(input_shape) != 1:
+      raise ValueError(
+          'Conv1DTranspose requires rank 3 input, got '
+          f'channel_shape={input_shape}.'
+      )
+    return (self.filters,)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.compute_dtype or self._param_dtype
+
+  def _raw_conv_transpose(self, values):
+    """Apply raw transpose convolution (no padding trim)."""
+    compute_dtype = self.compute_dtype or self._param_dtype
+    values = values.astype(compute_dtype)
+    y = mx.conv_transpose1d(
+        values,
+        self.kernel.astype(compute_dtype),
+        stride=self.strides,
+        padding=0,
+        dilation=self.dilation_rate,
+        groups=self.groups,
+    )
+    if self.use_bias:
+      y = y + self.bias.astype(compute_dtype)
+    if self.activation is not None:
+      y = self.activation(y)
+    return y
+
+  def _forward(self, values):
+    """Apply transpose convolution with output trimming."""
+    y = self._raw_conv_transpose(values)
+    tl, tr = _transpose_conv_output_trim(
+        self.kernel_size,
+        self.strides,
+        self.dilation_rate,
+        self.padding,
+    )
+    if tl > 0:
+      y = y[:, tl:]
+    if tr > 0:
+      y = y[:, :-tr]
+    return y
+
+  @property
+  def _ola_buffer_width(self):
+    """Returns the buffer width required for overlap-add step mode."""
+    return max(
+        0,
+        _effective_kernel_size(self.kernel_size, self.dilation_rate)
+        - self.strides,
+    )
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(input_spec.shape[-1])
+    if not self.supports_step:
+      return ()
+    bw = self._ola_buffer_width
+    if not bw:
+      return ()
+    compute_dtype = self.compute_dtype or self._param_dtype
+    return mx.zeros(
+        (batch_size, bw, self.filters),
+        dtype=compute_dtype,
+    )
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(x.shape[-1])
+    # Use raw conv (no trimming) for overlap-add.
+    values = self._raw_conv_transpose(x.values)
+    mask = mx.repeat(x.mask, self.strides, axis=1)
+    out_time = self.strides * x.shape[1]
+
+    bw = self._ola_buffer_width
+    if bw:
+      # Overlap-add: the first bw samples overlap with buffer.
+      overlap = values[:, :bw] + cast(Any, state)
+      rest = values[:, bw:]
+      values = mx.concatenate([overlap, rest], axis=1)
+
+      output_samples = out_time
+      output = values[:, :output_samples]
+      state = values[:, output_samples : output_samples + bw]
+      if state.shape[1] < bw:
+        pad_right = bw - state.shape[1]
+        state = mx.pad(state, [(0, 0), (0, pad_right), (0, 0)])
+      values = output
+    else:
+      if values.shape[1] < out_time:
+        pad_width = out_time - values.shape[1]
+        values = mx.pad(values, [(0, 0), (0, pad_width), (0, 0)])
+
+    return Sequence(values, mask), state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+
+    self._ensure_initialized(x.shape[-1])
+    if self.padding == PaddingMode.CAUSAL.value:
+      # For causal, use raw conv and trim trailing overlap.
+      values = self._raw_conv_transpose(x.values)
+      expected_time = x.shape[1] * self.strides
+      values = values[:, :expected_time]
+      mask = mx.repeat(x.mask, self.strides, axis=1)
+    else:
+      values = self._forward(x.values)
+      mask = _compute_conv_transpose_mask(
+          x.mask,
+          self.kernel_size,
+          self.strides,
+          self.dilation_rate,
+          self.padding,
+      )
+      expected_time = _compute_conv_transpose_output_length(
+          x.shape[1],
+          self.kernel_size,
+          self.strides,
+          self.dilation_rate,
+          self.padding,
+      )
+      values = values[:, :expected_time]
+      mask = mask[:, :expected_time]
+
+    if values.shape[1] < expected_time:
+      pad_width = expected_time - values.shape[1]
+      values = mx.pad(values, [(0, 0), (0, pad_width), (0, 0)])
+    if mask.shape[1] < expected_time:
+      pad_width = expected_time - mask.shape[1]
+      mask = mx.pad(mask, [(0, 0), (0, pad_width)])
+
+    return Sequence(values, mask)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a Conv1DTranspose instance from its configuration object."""
+    return cls(config)
diff --git a/sequence_layers/mlx/convolution2d.py b/sequence_layers/mlx/convolution2d.py
new file mode 100644
index 0000000..21f2130
--- /dev/null
+++ b/sequence_layers/mlx/convolution2d.py
@@ -0,0 +1,1269 @@
+"""2D Convolution, transpose convolution, pooling, and upsampling layers for MLX."""
+
+# pylint: disable=protected-access,abstract-method
+
+import dataclasses
+import fractions
+from typing import Any, Callable, override
+from typing import Sequence as TypingSequence
+
+import mlx.core as mx
+
+from sequence_layers.mlx import convolution as conv_utils
+from sequence_layers.mlx import init_mapping
+from sequence_layers.mlx import types
+from sequence_layers.mlx import utils as mlx_utils
+from sequence_layers.mlx.types import \
+    SequenceLayerConfig as _SequenceLayerConfig
+from sequence_layers.specs import convolution as spec
+
+from . import types as bt
+
+Sequence = bt.Sequence
+MaskedSequence = bt.MaskedSequence
+PaddingMode = bt.PaddingMode
+
+
+def _normalize_2tuple(x):
+  """Normalizes an int or sequence to a 2-tuple."""
+  if isinstance(x, int):
+    return (x, x)
+  return tuple(x)
+
+
+def _explicit_padding_2d(padding, kernel_size, stride, dilation_rate):
+  """Returns ((pad_time_left, pad_time_right), (pad_spatial_left, pad_spatial_right))."""
+  time_pad = conv_utils._explicit_padding(
+      padding[0] if isinstance(padding, (list, tuple)) else padding,
+      kernel_size[0],
+      stride[0],
+      dilation_rate[0],
+  )
+  spatial_padding = (
+      padding[1] if isinstance(padding, (list, tuple)) else padding
+  )
+  spatial_pad = conv_utils._explicit_padding(
+      spatial_padding,
+      kernel_size[1],
+      stride[1],
+      dilation_rate[1],
+  )
+  return time_pad, spatial_pad
+
+
+# ---------------------------------------------------------------------------
+# Conv2D
+# ---------------------------------------------------------------------------
+
+
+class Conv2D(types.SequenceLayer, spec.Conv2D[bt.Sequence, bt.ChannelSpec]):
+  """2D convolution layer with separate time and spatial padding."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Conv2D.Config):
+    """Configuration for Conv2D."""
+
+    filters: int
+    kernel_size: int | TypingSequence[int]
+    strides: int | TypingSequence[int] = 1
+    dilation_rate: int | TypingSequence[int] = 1
+    time_padding: bt.PaddingModeString = PaddingMode.VALID.value
+    spatial_padding: bt.PaddingModeString | tuple[int, int] = (
+        PaddingMode.SAME.value
+    )
+    groups: int = 1
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Conv2D':
+      return Conv2D(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      in_features: int | None = None,
+      filters: int | None = None,
+      kernel_size: int | TypingSequence[int] | None = None,
+      strides: int | TypingSequence[int] = (1, 1),
+      dilation_rate: int | TypingSequence[int] = (1, 1),
+      time_padding: Any = 'valid',
+      spatial_padding: Any = 'same',
+      groups: int = 1,
+      use_bias: bool = True,
+      activation=None,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if filters is None or kernel_size is None:
+        raise ValueError(
+            'Must provide either config or filters and kernel_size'
+        )
+      self.config = self.Config(
+          filters=filters,
+          kernel_size=kernel_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          time_padding=time_padding,
+          spatial_padding=spatial_padding,
+          groups=groups,
+          use_bias=use_bias,
+          activation=activation,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+      )
+
+    self.in_features = in_features
+    self.filters = self.config.filters
+    self.kernel_size = _normalize_2tuple(self.config.kernel_size)
+    self.strides = _normalize_2tuple(self.config.strides)
+    self.dilation_rate = _normalize_2tuple(self.config.dilation_rate)
+    self.time_padding = self.config.time_padding
+    self.spatial_padding = self.config.spatial_padding
+    self.groups = self.config.groups
+    self.use_bias = self.config.use_bias
+    self.activation = init_mapping.map_activation(self.config.activation)
+    self.compute_dtype = (
+        init_mapping._to_mx_dtype(self.config.compute_dtype)
+        if self.config.compute_dtype is not None
+        else None
+    )
+    self._param_dtype = init_mapping._to_mx_dtype(self.config.param_dtype)
+
+    self.kernel: Any = None
+    self.bias: Any = None
+    if in_features is not None:
+      self._ensure_initialized(in_features)
+
+  def _ensure_initialized(self, in_features: int):
+    """Initializes the Conv2D layer weights and biases."""
+    if self.kernel is not None:
+      return
+    self.in_features = in_features
+    # Create kernel: [out_channels, kH, kW, in_channels // groups]
+    key = mx.random.key(0)
+    init_fn = init_mapping._make_variance_scaling_init(
+        'fan_in', 'truncated_normal'
+    )
+    self.kernel = init_fn(
+        key,
+        (
+            self.filters,
+            self.kernel_size[0],
+            self.kernel_size[1],
+            in_features // self.groups,
+        ),
+        self._param_dtype,
+    )
+    if self.use_bias:
+      self.bias = mx.zeros((self.filters,), dtype=self._param_dtype)
+
+  @property
+  @override
+  def supports_step(self):
+    return conv_utils._supports_step(self.time_padding)
+
+  @property
+  @override
+  def block_size(self):
+    return self.strides[0]
+
+  @property
+  @override
+  def output_ratio(self):
+    return fractions.Fraction(1, self.strides[0])
+
+  @property
+  @override
+  def input_latency(self):
+    ek = conv_utils._effective_kernel_size(
+        self.kernel_size[0], self.dilation_rate[0]
+    )
+    if self.time_padding in (
+        PaddingMode.CAUSAL_VALID.value,
+        PaddingMode.CAUSAL.value,
+        PaddingMode.SEMICAUSAL.value,
+    ):
+      return 0
+    if self.time_padding in (
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+        PaddingMode.REVERSE_CAUSAL.value,
+    ):
+      return ek - 1
+    return 0
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if len(input_shape) != 2:
+      raise ValueError(
+          f'Conv2D requires rank 4 input. Got channel_shape={input_shape}'
+      )
+
+    freq_dim = input_shape[0]
+    # Compute spatial output size.
+    if isinstance(self.spatial_padding, str):
+      sp_pad = conv_utils._explicit_padding(
+          self.spatial_padding,
+          self.kernel_size[1],
+          self.strides[1],
+          self.dilation_rate[1],
+      )
+    else:
+      sp_pad = self.spatial_padding
+    ek_sp = conv_utils._effective_kernel_size(
+        self.kernel_size[1], self.dilation_rate[1]
+    )
+    out_freq = (freq_dim + sp_pad[0] + sp_pad[1] - ek_sp) // self.strides[1] + 1
+    return (out_freq, self.filters)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.compute_dtype or self._param_dtype
+
+  def _forward(self, values, time_pad, spatial_pad):
+    """Apply 2D conv with explicit padding."""
+    if (
+        time_pad[0] > 0
+        or time_pad[1] > 0
+        or spatial_pad[0] > 0
+        or spatial_pad[1] > 0
+    ):
+      values = mx.pad(
+          values,
+          [
+              (0, 0),
+              (time_pad[0], time_pad[1]),
+              (spatial_pad[0], spatial_pad[1]),
+              (0, 0),
+          ],
+      )
+    compute_dtype = self.compute_dtype or self._param_dtype
+    values = values.astype(compute_dtype)
+    # mlx.core.conv2d: input [B, H, W, C_in], weight [C_out, kH, kW, C_in/groups]
+    y = mx.conv2d(
+        values,
+        self.kernel.astype(compute_dtype),
+        stride=self.strides,
+        padding=0,
+        dilation=self.dilation_rate,
+        groups=self.groups,
+    )
+    if self.use_bias:
+      y = y + self.bias.astype(compute_dtype)
+    if self.activation is not None:
+      y = self.activation(y)
+    return y
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(input_spec.shape[-1])
+    bw = conv_utils._buffer_width(
+        self.time_padding,
+        self.kernel_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+    )
+    if not bw:
+      return ()
+    # State is a MaskedSequence of shape [B, bw, freq, channels].
+    if self.time_padding in (
+        PaddingMode.CAUSAL_VALID.value,
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+    ):
+      mask = mx.ones((batch_size, bw), dtype=bt.MASK_DTYPE)
+    else:
+      mask = mx.zeros((batch_size, bw), dtype=bt.MASK_DTYPE)
+    values = mx.zeros(
+        (batch_size, bw) + input_spec.shape,
+        dtype=input_spec.dtype,
+    )
+    return MaskedSequence(values, mask)
+
+  @types.check_step
+  @override
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(x.channel_shape[-1])
+    ek_time = conv_utils._effective_kernel_size(
+        self.kernel_size[0], self.dilation_rate[0]
+    )
+    if ek_time > 1:
+      x = x.mask_invalid()
+
+    bw = conv_utils._buffer_width(
+        self.time_padding,
+        self.kernel_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+    )
+
+    if bw:
+      state = state.concatenate(x)  # pyrefly: ignore[missing-attribute]
+    else:
+      state = x
+
+    # Spatial padding always applied; time padding from buffer.
+    if isinstance(self.spatial_padding, str):
+      sp_pad = conv_utils._explicit_padding(
+          self.spatial_padding,
+          self.kernel_size[1],
+          self.strides[1],
+          self.dilation_rate[1],
+      )
+    else:
+      sp_pad = self.spatial_padding
+
+    values = self._forward(state.values, (0, 0), sp_pad)
+    mask = conv_utils._compute_conv_mask(
+        state.mask,
+        self.kernel_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+        self.time_padding,
+        is_step=True,
+    )
+
+    if bw:
+      state = state[:, -bw:]
+    else:
+      state = ()
+
+    return Sequence(values, mask), state
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+
+    self._ensure_initialized(x.channel_shape[-1])
+    l_out_time = conv_utils._compute_output_length(
+        x.shape[1],
+        self.kernel_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+        self.time_padding,
+    )
+    if l_out_time == 0:
+      output_spec = self.get_output_spec(x.channel_spec, constants=constants)
+      empty_values = mx.zeros(
+          (x.shape[0], 0, *output_spec.shape), dtype=x.values.dtype
+      )
+      empty_mask = mx.zeros((x.shape[0], 0), dtype=mx.bool_)
+      return Sequence(empty_values, empty_mask)
+
+    if self.kernel_size[0] > 1:
+      x = x.mask_invalid()
+
+    time_pad = conv_utils._explicit_padding(
+        self.time_padding,
+        self.kernel_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+    )
+    if isinstance(self.spatial_padding, str):
+      sp_pad = conv_utils._explicit_padding(
+          self.spatial_padding,
+          self.kernel_size[1],
+          self.strides[1],
+          self.dilation_rate[1],
+      )
+    else:
+      sp_pad = self.spatial_padding
+
+    values = self._forward(x.values, time_pad, sp_pad)
+    mask = conv_utils._compute_conv_mask(
+        x.mask,
+        self.kernel_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+        self.time_padding,
+        is_step=False,
+    )
+    return Sequence(values, mask)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a Conv2D instance from its configuration object."""
+    return cls(config)
+
+
+# ---------------------------------------------------------------------------
+# Conv2DTranspose
+# ---------------------------------------------------------------------------
+
+
+class Conv2DTranspose(
+    types.SequenceLayer, spec.Conv2DTranspose[bt.Sequence, bt.ChannelSpec]
+):
+  """2D transposed convolution layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Conv2DTranspose.Config):
+    """Configuration for Conv2DTranspose."""
+
+    filters: int
+    kernel_size: int | TypingSequence[int]
+    strides: int | TypingSequence[int] = 1
+    dilation_rate: int | TypingSequence[int] = 1
+    time_padding: bt.PaddingModeString = PaddingMode.VALID.value
+    spatial_padding: bt.PaddingModeString | tuple[int, int] = (
+        PaddingMode.SAME.value
+    )
+    groups: int = 1
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Conv2DTranspose':
+      return Conv2DTranspose(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      in_features: int | None = None,
+      filters: int | None = None,
+      kernel_size: int | TypingSequence[int] | None = None,
+      strides: int | TypingSequence[int] = (1, 1),
+      dilation_rate: int | TypingSequence[int] = (1, 1),
+      time_padding: Any = 'valid',
+      spatial_padding: Any = 'same',
+      groups: int = 1,
+      use_bias: bool = True,
+      activation=None,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if filters is None or kernel_size is None:
+        raise ValueError(
+            'Must provide either config or filters and kernel_size'
+        )
+      self.config = self.Config(
+          filters=filters,
+          kernel_size=kernel_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          time_padding=time_padding,
+          spatial_padding=spatial_padding,
+          groups=groups,
+          use_bias=use_bias,
+          activation=activation,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+      )
+
+    self.in_features = in_features
+    self.filters = self.config.filters
+    self.kernel_size = _normalize_2tuple(self.config.kernel_size)
+    self.strides = _normalize_2tuple(self.config.strides)
+    self.dilation_rate = _normalize_2tuple(self.config.dilation_rate)
+    self.time_padding = self.config.time_padding
+    self.spatial_padding = self.config.spatial_padding
+    self.groups = self.config.groups
+    self.use_bias = self.config.use_bias
+    self.activation = init_mapping.map_activation(self.config.activation)
+    self.compute_dtype = (
+        init_mapping._to_mx_dtype(self.config.compute_dtype)
+        if self.config.compute_dtype is not None
+        else None
+    )
+    self._param_dtype = init_mapping._to_mx_dtype(self.config.param_dtype)
+
+    self.kernel: Any = None
+    self.bias: Any = None
+    if in_features is not None:
+      self._ensure_initialized(in_features)
+
+  def _ensure_initialized(self, in_features: int):
+    """Initializes the Conv2DTranspose weights and biases."""
+    if self.kernel is not None:
+      return
+    self.in_features = in_features
+    # Kernel: [out_channels, kH, kW, in_channels // groups]
+    key = mx.random.key(0)
+    init_fn = init_mapping._make_variance_scaling_init(
+        'fan_in', 'truncated_normal'
+    )
+    self.kernel = init_fn(
+        key,
+        (
+            self.filters,
+            self.kernel_size[0],
+            self.kernel_size[1],
+            in_features // self.groups,
+        ),
+        self._param_dtype,
+    )
+    if self.use_bias:
+      self.bias = mx.zeros((self.filters,), dtype=self._param_dtype)
+
+  @property
+  @override
+  def supports_step(self):
+    return self.time_padding == PaddingMode.CAUSAL.value
+
+  @property
+  @override
+  def block_size(self):
+    return 1
+
+  @property
+  @override
+  def output_ratio(self):
+    return fractions.Fraction(self.strides[0])
+
+  @property
+  @override
+  def input_latency(self):
+    return 0
+
+  def _time_trim(self):
+    """Returns (trim_left, trim_right) for time dimension."""
+    return conv_utils._transpose_conv_output_trim(
+        self.kernel_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+        self.time_padding,
+    )
+
+  def _spatial_trim(self):
+    """Returns (trim_left, trim_right) for spatial dimension."""
+    if isinstance(self.spatial_padding, str):
+      return conv_utils._transpose_conv_output_trim(
+          self.kernel_size[1],
+          self.strides[1],
+          self.dilation_rate[1],
+          self.spatial_padding,
+      )
+    return self.spatial_padding
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if len(input_shape) != 2:
+      raise ValueError(
+          'Conv2DTranspose requires rank 4 input. Got'
+          f' channel_shape={input_shape}'
+      )
+    freq_dim = input_shape[0]
+    ek_sp = conv_utils._effective_kernel_size(
+        self.kernel_size[1], self.dilation_rate[1]
+    )
+    raw_sp = (freq_dim - 1) * self.strides[1] + ek_sp
+    sp_trim = self._spatial_trim()
+    out_freq = raw_sp - sp_trim[0] - sp_trim[1]
+    return (out_freq, self.filters)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.compute_dtype or self._param_dtype
+
+  def _conv_raw(self, values, trim_time=True):
+    """Compute raw conv_transpose2d, optionally trimming time.
+
+    Args:
+      values: Input values.
+      trim_time: If True, trim time dimension (for layer mode).
+                 If False, skip time trim (for step mode overlap-add).
+    Returns:
+      Raw convolution output WITHOUT bias or activation.
+    """
+    compute_dtype = self.compute_dtype or self._param_dtype
+    values = values.astype(compute_dtype)
+    # mx.conv_transpose2d: input [B, H, W, C_in], weight [C_out, kH, kW, C_in/groups]
+    y = mx.conv_transpose2d(
+        values,
+        self.kernel.astype(compute_dtype),
+        stride=self.strides,
+        padding=0,
+        dilation=self.dilation_rate,
+        groups=self.groups,
+    )
+    # Time trim (only in layer mode; step mode handles it via overlap-add).
+    if trim_time:
+      tl, tr = self._time_trim()
+      if tl > 0:
+        y = y[:, tl:]
+      if tr > 0:
+        y = y[:, :-tr]
+    # Spatial trim (always applied).
+    sl_val, sr = self._spatial_trim()
+    if sl_val > 0:
+      y = y[:, :, sl_val:]
+    if sr > 0:
+      y = y[:, :, :-sr]
+    return y
+
+  def _apply_bias_and_activation(self, y):
+    """Apply bias and activation to conv output."""
+    compute_dtype = self.compute_dtype or self._param_dtype
+    if self.use_bias:
+      y = y + self.bias.astype(compute_dtype)
+    if self.activation is not None:
+      y = self.activation(y)
+    return y
+
+  def _forward(self, values):
+    """Full forward: conv + trim + bias + activation (for layer mode)."""
+    y = self._conv_raw(values, trim_time=True)
+    return self._apply_bias_and_activation(y)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(x.channel_shape[-1])
+    values = self._forward(x.values)
+    mask = conv_utils._compute_conv_transpose_mask(
+        x.mask,
+        self.kernel_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+        self.time_padding,
+    )
+    return Sequence(values, mask)
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(input_spec.shape[-1])
+    if not self.supports_step:
+      raise ValueError(
+          'Conv2DTranspose step only supported with causal padding.'
+      )
+    ola_buf = max(
+        0,
+        conv_utils._effective_kernel_size(
+            self.kernel_size[0], self.dilation_rate[0]
+        )
+        - self.strides[0],
+    )
+    if not ola_buf:
+      return ()
+    out_shape = self.get_output_shape(input_spec.shape, constants=constants)
+    values = mx.zeros(
+        (batch_size, ola_buf) + out_shape,
+        dtype=self.get_output_dtype(input_spec.dtype),
+    )
+    mask = mx.zeros((batch_size, ola_buf), dtype=bt.MASK_DTYPE)
+    return MaskedSequence(values, mask)
+
+  @types.check_step
+  @override
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state, *, training: bool, constants=None
+  ):
+
+    self._ensure_initialized(x.channel_shape[-1])
+    x = x.mask_invalid()
+    # Conv WITHOUT time trimming — keep full temporal output for overlap-add.
+    # Bias is also deferred until after overlap-add (matching JAX behavior).
+    raw = self._conv_raw(x.values, trim_time=False)
+    input_time = x.shape[1]
+    out_time = input_time * self.strides[0]
+    mask = mx.repeat(x.mask, self.strides[0], axis=1)
+
+    ola_buf = max(
+        0,
+        conv_utils._effective_kernel_size(
+            self.kernel_size[0], self.dilation_rate[0]
+        )
+        - self.strides[0],
+    )
+    if ola_buf:
+      # Pad the state buffer to match the raw output length, then overlap-add.
+      # raw has shape (B, raw_time, ...) where raw_time >= out_time + ola_buf
+      buf_values = state.values  # pyrefly: ignore[missing-attribute]
+
+      pad_len = raw.shape[1] - ola_buf
+      if pad_len > 0:
+        buf_values = mx.concatenate(
+            [buf_values, mx.zeros_like(raw[:, :pad_len])], axis=1
+        )
+      # Overlap-add: add state to raw output.
+      out_values = buf_values + raw
+      # Split: first out_time samples are output, rest is new buffer.
+      out = out_values[:, :out_time]
+      new_buf = out_values[:, out_time:]
+      if new_buf.shape[1] < ola_buf:
+        pad_width = ola_buf - new_buf.shape[1]
+        new_buf = mx.pad(
+            new_buf, [(0, 0), (0, pad_width)] + [(0, 0)] * (new_buf.ndim - 2)
+        )
+      elif new_buf.shape[1] > ola_buf:
+        new_buf = new_buf[:, :ola_buf]
+      new_mask = mx.zeros((x.values.shape[0], ola_buf), dtype=bt.MASK_DTYPE)
+      state = MaskedSequence(new_buf, new_mask)
+    else:
+      out = raw[:, :out_time]
+      state = ()
+
+    # Apply bias and activation AFTER overlap-add (only once per sample).
+    out = self._apply_bias_and_activation(out)
+
+    out_mask = mask[:, : out.shape[1]]
+    return Sequence(out, out_mask), state
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a Conv2DTranspose instance from its configuration object."""
+    return cls(config)
+
+
+# ---------------------------------------------------------------------------
+# AveragePooling2D
+# ---------------------------------------------------------------------------
+
+
+class AveragePooling2D(types.SequenceLayer):
+  """2D average pooling with separate time and spatial padding."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(_SequenceLayerConfig):
+    """Configuration for AveragePooling2D."""
+
+    pool_size: tuple[int, int] = (1, 1)
+    strides: tuple[int, int] = (1, 1)
+    dilation_rate: tuple[int, int] = (1, 1)
+    time_padding: str = 'valid'
+    spatial_padding: str | tuple[int, int] = 'same'
+    masked_average: bool = False
+    name: str | None = None
+
+    def __post_init__(self):
+      object.__setattr__(self, 'pool_size', _normalize_2tuple(self.pool_size))
+      object.__setattr__(self, 'strides', _normalize_2tuple(self.strides))
+      object.__setattr__(
+          self, 'dilation_rate', _normalize_2tuple(self.dilation_rate)
+      )
+
+    @override
+    def make(self) -> types.SequenceLayer:
+      return AveragePooling2D.from_config(self)
+
+  def __init__(
+      self,
+      *,
+      pool_size,
+      strides=(1, 1),
+      dilation_rate=(1, 1),
+      time_padding='valid',
+      spatial_padding='same',
+      masked_average=False,
+  ):
+    super().__init__()
+    self.pool_size = _normalize_2tuple(pool_size)
+    self.strides = _normalize_2tuple(strides)
+    self.dilation_rate = _normalize_2tuple(dilation_rate)
+    self.time_padding = time_padding
+    self.spatial_padding = spatial_padding
+    self.masked_average = masked_average
+
+  @property
+  @override
+  def supports_step(self):
+    return conv_utils._supports_step(self.time_padding)
+
+  @property
+  @override
+  def block_size(self):
+    return self.strides[0]
+
+  @property
+  @override
+  def output_ratio(self):
+    return fractions.Fraction(1, self.strides[0])
+
+  @property
+  @override
+  def input_latency(self):
+
+    ek = conv_utils._effective_kernel_size(
+        self.pool_size[0], self.dilation_rate[0]
+    )
+    if self.time_padding in (
+        PaddingMode.CAUSAL_VALID.value,
+        PaddingMode.CAUSAL.value,
+        PaddingMode.SEMICAUSAL.value,
+    ):
+      return 0
+    if self.time_padding in (
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+        PaddingMode.REVERSE_CAUSAL.value,
+    ):
+      return ek - 1
+    return 0
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if len(input_shape) != 2:
+      raise ValueError(
+          'AveragePooling2D requires rank 4 input. Got'
+          f' channel_shape={input_shape}'
+      )
+    freq_dim = input_shape[0]
+    if isinstance(self.spatial_padding, str):
+      sp_pad = conv_utils._explicit_padding(
+          self.spatial_padding,
+          self.pool_size[1],
+          self.strides[1],
+          self.dilation_rate[1],
+      )
+    else:
+      sp_pad = self.spatial_padding
+    ek_sp = conv_utils._effective_kernel_size(
+        self.pool_size[1], self.dilation_rate[1]
+    )
+    out_freq = (freq_dim + sp_pad[0] + sp_pad[1] - ek_sp) // self.strides[1] + 1
+    return (out_freq, input_shape[1])
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return input_dtype
+
+  def _pool(self, values, time_pad, spatial_pad) -> Any:
+    """Apply 2D average pooling with explicit padding."""
+    if (
+        time_pad[0] > 0
+        or time_pad[1] > 0
+        or spatial_pad[0] > 0
+        or spatial_pad[1] > 0
+    ):
+      values = mx.pad(
+          values,
+          [
+              (0, 0),
+              (time_pad[0], time_pad[1]),
+              (spatial_pad[0], spatial_pad[1]),
+              (0, 0),
+          ],
+      )
+    # Implement average pooling via im2col-style approach.
+    # For simplicity, use a strided mean.
+    b, t, h, c = values.shape
+    pt, ps = self.pool_size
+    st, ss = self.strides
+    out_t = (t - pt) // st + 1
+    out_h = (h - ps) // ss + 1
+    # Extract patches and average.
+    result = mx.zeros((b, out_t, out_h, c), dtype=values.dtype)
+    patches = []
+    for dt in range(pt):
+      for ds in range(ps):
+        patch = values[
+            :, dt : dt + out_t * st : st, ds : ds + out_h * ss : ss, :
+        ]
+        patches.append(patch)
+    result = sum(patches) / len(patches)
+    return result
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+
+    l_out_time = conv_utils._compute_output_length(
+        x.shape[1],
+        self.pool_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+        self.time_padding,
+    )
+    if l_out_time == 0:
+      output_spec = self.get_output_spec(x.channel_spec, constants=constants)
+      empty_values = mx.zeros(
+          (x.shape[0], 0, *output_spec.shape), dtype=x.values.dtype
+      )
+      empty_mask = mx.zeros((x.shape[0], 0), dtype=mx.bool_)
+      return Sequence(empty_values, empty_mask)
+
+    time_pad = conv_utils._explicit_padding(
+        self.time_padding,
+        self.pool_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+    )
+    if isinstance(self.spatial_padding, str):
+      sp_pad = conv_utils._explicit_padding(
+          self.spatial_padding,
+          self.pool_size[1],
+          self.strides[1],
+          self.dilation_rate[1],
+      )
+    else:
+      sp_pad = self.spatial_padding
+
+    values = self._pool(x.values, time_pad, sp_pad)
+    mask = conv_utils._compute_conv_mask(
+        x.mask,
+        self.pool_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+        self.time_padding,
+        is_step=False,
+    )
+    return Sequence(values, mask)
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool = False, constants=None
+  ):
+    bw = conv_utils._buffer_width(
+        self.time_padding,
+        self.pool_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+    )
+    if not bw:
+      return ()
+    if self.time_padding in (
+        PaddingMode.CAUSAL_VALID.value,
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+    ):
+      mask = mx.ones((batch_size, bw), dtype=bt.MASK_DTYPE)
+    else:
+      mask = mx.zeros((batch_size, bw), dtype=bt.MASK_DTYPE)
+    values = mx.zeros(
+        (batch_size, bw) + input_spec.shape,
+        dtype=input_spec.dtype,
+    )
+    return MaskedSequence(values, mask)
+
+  @types.check_step
+  @override
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state, *, training: bool = False, constants=None
+  ):
+
+    bw = conv_utils._buffer_width(
+        self.time_padding,
+        self.pool_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+    )
+    if bw:
+      state = state.concatenate(x)  # pyrefly: ignore[missing-attribute]
+    else:
+
+      state = x
+
+    if isinstance(self.spatial_padding, str):
+      sp_pad = conv_utils._explicit_padding(
+          self.spatial_padding,
+          self.pool_size[1],
+          self.strides[1],
+          self.dilation_rate[1],
+      )
+    else:
+      sp_pad = self.spatial_padding
+
+    values = self._pool(state.values, (0, 0), sp_pad)
+    mask = conv_utils._compute_conv_mask(
+        state.mask,
+        self.pool_size[0],
+        self.strides[0],
+        self.dilation_rate[0],
+        self.time_padding,
+        is_step=True,
+    )
+
+    if bw:
+      state = state[:, -bw:]
+    else:
+      state = ()
+
+    return Sequence(values, mask), state
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates an AveragePooling2D instance from its configuration object."""
+    pool_size = _normalize_2tuple(config.pool_size)
+    strides = _normalize_2tuple(config.strides)
+    dilation_rate = _normalize_2tuple(getattr(config, 'dilation_rate', (1, 1)))
+    return cls(
+        pool_size=pool_size,
+        strides=strides,
+        dilation_rate=dilation_rate,
+        time_padding=getattr(config, 'time_padding', 'valid'),
+        spatial_padding=getattr(config, 'spatial_padding', 'same'),
+        masked_average=getattr(config, 'masked_average', False),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Upsample2D
+# ---------------------------------------------------------------------------
+
+
+class Upsample2D(types.PreservesType, types.Stateless):
+  """2D upsampling layer using nearest-neighbor repetition."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(_SequenceLayerConfig):
+    """Configuration for Upsample2D."""
+
+    rate: tuple[int, int] = (1, 1)
+    name: str | None = None
+
+    def __post_init__(self):
+      object.__setattr__(self, 'rate', _normalize_2tuple(self.rate))
+
+    @override
+    def make(self) -> types.SequenceLayer:
+      return Upsample2D.from_config(self)
+
+  def __init__(self, *, rate):
+    super().__init__()
+    self._rate = _normalize_2tuple(rate)
+
+  @property
+  @override
+  def output_ratio(self):
+    return fractions.Fraction(self._rate[0])
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if len(input_shape) != 2:
+      raise ValueError(
+          f'Upsample2D requires rank 4 input, got channel_shape={input_shape}'
+      )
+    return (input_shape[0] * self._rate[1], input_shape[1])
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+
+    values = mx.repeat(x.values, self._rate[0], axis=1)
+    values = mx.repeat(values, self._rate[1], axis=2)
+    mask = mx.repeat(x.mask, self._rate[0], axis=1)
+    return type(x)(values, mask)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates an Upsample2D instance from its configuration object."""
+    return cls(rate=_normalize_2tuple(config.rate))
+
+
+# ---------------------------------------------------------------------------
+# ParallelChannels
+# ---------------------------------------------------------------------------
+
+
+class ParallelChannels(types.Emitting):
+  """Applies a layer with shared parameters to groups of input channels.
+
+  The input sequence is split on its final channels dimension into num_groups
+  separate sequences and processed with the child layer. Parameters for the
+  child layer are shared across all parallel invocations.
+  """
+
+  # CombinationMode values matching the JAX utils version.
+  STACK = 1
+  CONCAT = 2
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(_SequenceLayerConfig):
+    """Configuration for ParallelChannels."""
+
+    child_layer: _SequenceLayerConfig | None = None
+    num_groups: int = 1
+    combination: object = None  # CombinationMode enum value
+    name: str | None = None
+
+    @override
+    def make(self) -> types.SequenceLayer:
+      return ParallelChannels.from_config(self)
+
+  def __init__(self, *, child_layer, num_groups, combination=CONCAT):
+    super().__init__()
+    self.child = child_layer
+    self._num_groups = num_groups
+    # Default to CONCAT (2) which is what soundstream uses.
+    if combination is None:
+      self._combination = self.CONCAT
+    elif hasattr(combination, 'value'):
+      self._combination = combination.value
+    else:
+      self._combination = int(combination)
+
+  @property
+  @override
+  def supports_step(self):
+    return self.child.supports_step
+
+  @property
+  @override
+  def block_size(self):
+    return self.child.block_size
+
+  @property
+  @override
+  def output_ratio(self):
+    return self.child.output_ratio
+
+  @property
+  @override
+  def input_latency(self):
+    return self.child.input_latency
+
+  def _split(self, x):
+    """Split sequence along last channel dim into num_groups."""
+    vals = x.values
+    c = vals.shape[-1]
+    if c % self._num_groups != 0:
+      raise ValueError(
+          f'Input channels ({c}) must be divisible by num_groups'
+          f' ({self._num_groups}).'
+      )
+    group_size = c // self._num_groups
+    groups = []
+    for i in range(self._num_groups):
+      g_vals = vals[..., i * group_size : (i + 1) * group_size]
+      groups.append(type(x)(g_vals, x.mask))
+    return groups
+
+  def _combine(self, outputs):
+    """Combine group outputs."""
+    if self._combination == self.CONCAT:
+      # Concatenate along last axis.
+      combined_vals = mx.concatenate([o.values for o in outputs], axis=-1)
+      return Sequence(combined_vals, outputs[0].mask)
+    if self._combination == self.STACK:
+      # Stack along a new axis before the last.
+      stacked = mx.stack([o.values for o in outputs], axis=-2)
+      return Sequence(stacked, outputs[0].mask)
+    raise ValueError(f'Unsupported combination mode: {self._combination}')
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if not input_shape:
+      raise ValueError(f'Input must be at least 3D, got: {input_shape=}.')
+    if input_shape[-1] % self._num_groups != 0:
+      raise ValueError(
+          f'Input channels ({input_shape[-1]}) must be divisible by'
+          f' num_groups ({self._num_groups}).'
+      )
+    group_shape = list(input_shape)
+    group_shape[-1] //= self._num_groups
+    child_shape = self.child.get_output_shape(
+        tuple(group_shape), constants=constants
+    )
+    if self._combination == self.CONCAT:
+      return child_shape[:-1] + (child_shape[-1] * self._num_groups,)
+    if self._combination == self.STACK:
+      return child_shape[:-1] + (self._num_groups,) + (child_shape[-1],)
+    raise ValueError(f'Unsupported combination mode: {self._combination}')
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.child.get_output_dtype(input_dtype, constants=constants)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+
+    groups = self._split(x)
+    outputs = [
+        self.child.layer(g, training=training, constants=constants)
+        for g in groups
+    ]
+    return self._combine(outputs)
+
+  @override
+  def layer_with_emits(self, x, *, training: bool = False, constants=None):
+    groups = self._split(x)
+    outputs, emits = [], []
+    for g in groups:
+      y, e = self.child.layer_with_emits(
+          g, training=training, constants=constants
+      )
+      outputs.append(y)
+      emits.append(e)
+    return self._combine(outputs), tuple(emits)
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool = False, constants=None
+  ):
+    if not input_spec.shape:
+      raise ValueError(f'Input must be at least 3D, got: {input_spec.shape=}.')
+    if input_spec.shape[-1] % self._num_groups != 0:
+      raise ValueError(
+          f'Input channels ({input_spec.shape[-1]}) must be divisible by'
+          f' num_groups ({self._num_groups}).'
+      )
+    group_shape = list(input_spec.shape)
+    group_shape[-1] //= self._num_groups
+    group_spec = types.ChannelSpec(
+        shape=tuple(group_shape),
+        dtype=input_spec.dtype,
+    )
+    state = self.child.get_initial_state(
+        batch_size, group_spec, training=training, constants=constants
+    )
+    return (state,) * self._num_groups
+
+  @types.check_step
+  @override
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state, *, training: bool = False, constants=None
+  ):
+
+    groups = self._split(x)
+    outputs = []
+    new_states = []
+    for g, s in zip(groups, state):  # pyrefly: ignore[bad-argument-type]
+      y, ns = self.child.step(g, s, training=training, constants=constants)
+      outputs.append(y)
+      new_states.append(ns)
+    return self._combine(outputs), tuple(new_states)
+
+  @override
+  def step_with_emits(
+      self, x, state, *, training: bool = False, constants=None
+  ):
+    groups = self._split(x)
+    outputs, new_states, emits = [], [], []
+    for g, s in zip(groups, state):  # pyrefly: ignore[bad-argument-type]
+      y, ns, e = self.child.step_with_emits(
+          g, s, training=training, constants=constants
+      )
+      outputs.append(y)
+      new_states.append(ns)
+      emits.append(e)
+    return self._combine(outputs), tuple(new_states), tuple(emits)
+
+  @classmethod
+  def from_config(cls, config, backend='mlx'):
+    """Creates a ParallelChannels instance from its configuration object."""
+    child = mlx_utils.make_layer(config.child_layer, backend=backend)
+    return cls(
+        child_layer=child,
+        num_groups=config.num_groups,
+        combination=config.combination,
+    )
diff --git a/sequence_layers/mlx/convolution_test.py b/sequence_layers/mlx/convolution_test.py
new file mode 100644
index 0000000..0d2c7a5
--- /dev/null
+++ b/sequence_layers/mlx/convolution_test.py
@@ -0,0 +1,103 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for convolution MLX sequence layers."""
+
+from absl.testing import absltest
+from absl.testing import parameterized
+
+from sequence_layers.mlx import convolution
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import convolution_behaviors as spec
+
+
+class Conv1DTest(
+    spec.Conv1DTest, test_utils.SequenceLayerTest, parameterized.TestCase
+):
+
+  def test_from_config(self):
+    config = convolution.Conv1D.Config(
+        filters=8,
+        kernel_size=3,
+        padding='causal',
+    )
+    mlx_layer = config.make()
+    self.assertIsInstance(
+        mlx_layer,
+        convolution.Conv1D,
+    )
+    x = self.random_sequence(1, 8, 4)
+    y = mlx_layer.layer(x, training=False)
+    self.assertEqual(y.channel_shape, (8,))
+
+
+class DepthwiseConv1DTest(
+    spec.DepthwiseConv1DTest,
+    test_utils.SequenceLayerTest,
+    parameterized.TestCase,
+):
+
+  def test_from_config(self):
+    config = convolution.DepthwiseConv1D.Config(
+        kernel_size=3,
+        padding='causal',
+    )
+    mlx_layer = config.make()
+    self.assertIsInstance(
+        mlx_layer,
+        convolution.DepthwiseConv1D,
+    )
+    x = self.random_sequence(1, 8, 4)
+    y = mlx_layer.layer(x, training=False)
+    self.assertEqual(y.channel_shape, (4,))
+
+
+class Conv1DTransposeTest(
+    spec.Conv1DTransposeTest,
+    test_utils.SequenceLayerTest,
+    parameterized.TestCase,
+):
+
+  def test_from_config(self):
+    config = convolution.Conv1DTranspose.Config(
+        filters=8,
+        kernel_size=3,
+        strides=2,
+        padding='causal',
+    )
+    mlx_layer = config.make()
+    self.assertIsInstance(
+        mlx_layer,
+        convolution.Conv1DTranspose,
+    )
+    x = self.random_sequence(1, 4, 4)
+    y = mlx_layer.layer(x, training=False)
+    self.assertEqual(y.channel_shape, (8,))
+
+
+class Conv2DTest(
+    spec.Conv2DTest, test_utils.SequenceLayerTest, parameterized.TestCase
+):
+  pass
+
+
+class Conv2DTransposeTest(
+    spec.Conv2DTransposeTest,
+    test_utils.SequenceLayerTest,
+    parameterized.TestCase,
+):
+  pass
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/decoder_transformer_test.py b/sequence_layers/mlx/decoder_transformer_test.py
new file mode 100644
index 0000000..4180e9e
--- /dev/null
+++ b/sequence_layers/mlx/decoder_transformer_test.py
@@ -0,0 +1,248 @@
+"""End-to-end test: decoder-only transformer on MLX.
+
+Defines a small decoder transformer using MLX configs, builds an MLX
+model via config.make(), and tests inference + export.
+"""
+
+import os
+import tempfile
+import unittest
+
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+from absl.testing import absltest
+from absl.testing import parameterized
+
+from sequence_layers.mlx import attention as mlx_attn
+from sequence_layers.mlx import combinators as mlx_comb
+from sequence_layers.mlx import dense as mlx_dense
+from sequence_layers.mlx import export
+from sequence_layers.mlx import normalization as mlx_norm
+from sequence_layers.mlx import position as mlx_pos
+from sequence_layers.mlx import simple as mlx_simple
+from sequence_layers.mlx import test_utils
+from sequence_layers.mlx import types as bt
+
+Sequence = bt.Sequence
+ShapeDType = bt.ShapeDType
+
+
+def _decoder_config(vocab_size=256, dim=64, num_heads=4, num_layers=2):
+  """A small decoder-only transformer config."""
+  return mlx_comb.Serial.Config([
+      mlx_simple.Embedding.Config(
+          num_embeddings=vocab_size,
+          dimension=dim,
+      ),
+      mlx_comb.Repeat.Config(
+          num_repeats=num_layers,
+          layer=mlx_comb.Serial.Config([
+              mlx_comb.Residual.Config([
+                  mlx_norm.RMSNormalization.Config(),
+                  mlx_attn.DotProductSelfAttention.Config(
+                      num_heads=num_heads,
+                      units_per_head=dim // num_heads,
+                      max_past_horizon=128,
+                      max_future_horizon=0,
+                      query_network=(
+                          mlx_pos.ApplyRotaryPositionalEncoding.Config(
+                              max_wavelength=10000.0,
+                          )
+                      ),
+                      key_network=(
+                          mlx_pos.ApplyRotaryPositionalEncoding.Config(
+                              max_wavelength=10000.0,
+                          )
+                      ),
+                  ),
+                  mlx_simple.Flatten.Config(),
+              ]),
+              mlx_comb.Residual.Config([
+                  mlx_norm.RMSNormalization.Config(),
+                  mlx_dense.Dense.Config(
+                      features=dim * 4,
+                      activation=nn.gelu,
+                  ),
+                  mlx_dense.Dense.Config(features=dim),
+              ]),
+          ]),
+      ),
+      mlx_norm.RMSNormalization.Config(),
+      mlx_dense.Dense.Config(features=vocab_size),
+  ])
+
+
+def _make_token_sequence(tokens):
+  """Create a Sequence from integer token ids.
+
+  Args:
+    tokens: A 2D list [[t1, t2, ...], ...] of shape [batch, time].
+
+  Returns:
+    Sequence with values shape [batch, time] and all-valid mask.
+  """
+  arr = mx.array(tokens, dtype=mx.int32)
+  if arr.ndim != 2:
+    raise ValueError(f'Expected 2D token array, got shape {arr.shape}')
+  mask = mx.ones(arr.shape, dtype=mx.bool_)
+  return Sequence(arr, mask)
+
+
+@unittest.skip("Core dumps on Linux due to activations")
+class DecoderTransformerTest(test_utils.SequenceLayerTest, parameterized.TestCase):
+  """End-to-end tests for a decoder transformer on MLX."""
+
+  def _make_model(self, config=None):
+    if config is None:
+      config = _decoder_config()
+    model = config.make()
+    return model
+
+  def test_make_mlx(self):
+    """config.make() produces an MLX SequenceLayer."""
+    config = _decoder_config()
+    model = config.make()
+    from sequence_layers.mlx import types
+
+    self.assertIsInstance(model, types.SequenceLayer)
+    self.assertTrue(model.supports_step)
+
+  def test_layer(self):
+    """model.layer() produces correct output shape and dtype."""
+    model = self._make_model()
+    batch, time, vocab_size = 2, 8, 256
+    # Input: integer token ids with scalar channel shape ().
+    x = _make_token_sequence([[0] * time] * batch)
+    y = model.layer(x, training=False)
+    self.assertEqual(y.shape, (batch, time, vocab_size))
+
+  def test_step(self):
+    """model.step() runs and output shape is correct."""
+    model = self._make_model()
+    batch, vocab_size = 1, 256
+    input_spec = ShapeDType((), mx.int32)
+
+    export._materialize_deferred(model, batch, input_spec)
+    state = model.get_initial_state(batch, input_spec, training=False)
+
+    # Step with a single token.
+    x = _make_token_sequence([[42]])
+    y, new_state = model.step(x, state, training=False)
+    self.assertEqual(y.shape, (batch, 1, vocab_size))
+
+    # Second step.
+    x2 = _make_token_sequence([[7]])
+    y2, state2 = model.step(x2, new_state, training=False)
+    self.assertEqual(y2.shape, (batch, 1, vocab_size))
+
+  def test_step_layer_match(self):
+    """step() and layer() produce matching outputs."""
+    model = self._make_model()
+    batch, time = 2, 8
+    values = mx.random.randint(0, 256, shape=(batch, time)).astype(mx.int32)
+    mask = mx.ones((batch, time), dtype=mx.bool_)
+    x = Sequence(values, mask)
+
+    y_layer = model.layer(x, training=False)
+    y_step, _ = self._step_by_step(model, x)
+
+    np.testing.assert_allclose(
+        np.array(y_step.values),
+        np.array(y_layer.values),
+        atol=1e-4,
+        rtol=1e-4,
+        err_msg='step() and layer() outputs differ',
+    )
+
+  def test_autoregressive_generation(self):
+    """Token-by-token generation loop with random weights."""
+    model = self._make_model()
+    batch, vocab_size, max_len = 1, 256, 16
+    input_spec = ShapeDType((), mx.int32)
+
+    export._materialize_deferred(model, batch, input_spec)
+    state = model.get_initial_state(batch, input_spec, training=False)
+
+    token = 0
+    generated = [token]
+
+    for _ in range(max_len - 1):
+      x = _make_token_sequence([[token]])
+      y, state = model.step(x, state, training=False)
+      mx.eval(y.values)
+
+      logits = y.values[0, 0]  # [vocab_size]
+      token = int(mx.argmax(logits))
+      generated.append(token)
+
+    self.assertLen(generated, max_len)
+    for t in generated:
+      self.assertGreaterEqual(t, 0)
+      self.assertLess(t, vocab_size)
+
+  def test_export_import(self):
+    """Export step to .mlxfn, import, verify same outputs."""
+    model = self._make_model()
+    batch, vocab_size = 1, 256
+    input_spec = ShapeDType((), mx.int32)
+
+    export._materialize_deferred(model, batch, input_spec)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+      path = os.path.join(tmpdir, 'decoder.mlxfn')
+      export.export_step(model, path, batch_size=batch, input_spec=input_spec)
+      self.assertTrue(os.path.exists(path))
+
+      imported = mx.import_function(path)
+
+      tokens = [42, 7, 13]
+      inputs = [_make_token_sequence([[t]]) for t in tokens]
+
+      # Native inference.
+      state = model.get_initial_state(batch, input_spec, training=False)
+      native_outputs = []
+      for x in inputs:
+        y, state = model.step(x, state, training=False)
+        mx.eval(y.values)
+        native_outputs.append(np.array(y.values))
+
+      # Exported inference.
+      flat_state, structure = export.get_initial_state_flat(
+          model, batch, input_spec
+      )
+      exported_outputs = []
+      for x in inputs:
+        y_vals, y_mask, flat_state = export.run_exported(
+            imported, x.values, x.mask, flat_state
+        )
+        mx.eval(y_vals)
+        exported_outputs.append(np.array(y_vals))
+
+      for i, (native, exported) in enumerate(
+          zip(native_outputs, exported_outputs)
+      ):
+        np.testing.assert_allclose(
+            exported,
+            native,
+            atol=1e-4,
+            rtol=1e-4,
+            err_msg=f'Token {tokens[i]}: exported != native',
+        )
+
+  def test_export_file_size(self):
+    """Exported .mlxfn file has reasonable size."""
+    model = self._make_model()
+    batch = 1
+    input_spec = ShapeDType((), mx.int32)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+      path = os.path.join(tmpdir, 'decoder.mlxfn')
+      export.export_step(model, path, batch_size=batch, input_spec=input_spec)
+      size_mb = os.path.getsize(path) / (1024 * 1024)
+      # Small model should be < 10 MB.
+      self.assertLess(size_mb, 10.0)
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/dense.py b/sequence_layers/mlx/dense.py
new file mode 100644
index 0000000..fef6a34
--- /dev/null
+++ b/sequence_layers/mlx/dense.py
@@ -0,0 +1,327 @@
+"""Dense sequence layer for MLX."""
+
+import dataclasses
+from typing import Callable, override
+
+from mlx import nn
+import mlx.core as mx
+
+from sequence_layers.mlx import init_mapping
+from sequence_layers.mlx import types
+from sequence_layers.mlx.simple import _to_mx_dtype
+from sequence_layers.specs import dense as spec
+
+
+class Dense(types.Stateless, spec.Dense[types.Sequence, types.ShapeDType]):
+  """A basic dense layer with deferred initialization.
+
+  Matches JAX interface where in_features is inferred on first call.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Dense.Config):
+    """Dense config."""
+
+    features: int
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Dense':
+      return Dense(self)
+
+  @classmethod
+  def from_config(cls, config: spec.Dense.Config) -> 'Dense':
+    """Creates a Dense layer from a spec Config."""
+    mlx_config = cls.Config(
+        features=config.features,
+        use_bias=config.use_bias,
+        activation=init_mapping.map_activation(config.activation),
+        compute_dtype=config.compute_dtype,
+        param_dtype=config.param_dtype or mx.float32,
+        name=config.name,
+    )
+    return cls(mlx_config)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      features: int | None = None,
+      in_features: int | None = None,
+      use_bias: bool = True,
+      activation=None,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+  ):
+    """Initialize Dense."""
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if features is None:
+        raise ValueError('Must provide either config or features')
+      self.config = self.Config(
+          features=features,
+          use_bias=use_bias,
+          activation=activation,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+      )
+    self._compute_dtype = _to_mx_dtype(self.config.compute_dtype)
+    self._param_dtype = _to_mx_dtype(self.config.param_dtype) or mx.float32
+    self.activation = init_mapping.map_activation(self.config.activation)
+    self._linear = None
+    if in_features is not None:
+      self._ensure_initialized(in_features)
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  def _ensure_initialized(self, in_features: int):
+    """Ensure nn.Linear is initialized on first call."""
+    if self._linear is not None:
+      return
+    self._linear = nn.Linear(
+        in_features, self.config.features, bias=self.config.use_bias
+    )
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    """Get output shape."""
+    if not input_shape:
+      raise ValueError(
+          f'Dense requires at least rank 3 input. Got: {input_shape=}'
+      )
+    return tuple(input_shape[:-1]) + (self.config.features,)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    if self._compute_dtype is not None:
+      return self._compute_dtype
+    assert self._param_dtype is not None
+    return self._param_dtype
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if x.ndim < 3:
+      raise ValueError(f'Dense requires at least rank 3 input. Got: {x.shape=}')
+    self._ensure_initialized(x.shape[-1])
+    assert self._linear is not None
+    activation = self.activation
+    compute_dtype = self.get_output_dtype(x.dtype)
+
+    def dense_fn(v):
+      y = self._linear(v.astype(compute_dtype))
+      if activation is not None:
+        y = activation(y)
+      return y
+
+    if self.config.use_bias or activation is not None:
+      return x.apply_values(dense_fn)
+    return x.apply_values_masked(dense_fn)
+
+
+class EinsumDense(
+    types.Stateless, spec.EinsumDense[types.Sequence, types.ShapeDType]
+):
+  """Dense layer using Einstein summation notation."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.EinsumDense.Config):
+    """MLX-native configuration for EinsumDense."""
+
+    equation: str
+    output_shape: tuple[int | None, ...]
+    bias_axes: str = ''
+    activation: Callable | None = None
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    name: str | None = None
+
+    def __post_init__(self):
+      object.__setattr__(self, 'output_shape', tuple(self.output_shape))
+
+    @override
+    def make(self) -> 'EinsumDense':
+      return EinsumDense(self)
+
+  @classmethod
+  def from_config(cls, config: spec.EinsumDense.Config) -> 'EinsumDense':
+    """Creates an EinsumDense layer from a spec Config."""
+    mlx_config = cls.Config(
+        equation=config.equation,
+        output_shape=tuple(config.output_shape),
+        bias_axes=config.bias_axes,
+        activation=init_mapping.map_activation(config.activation),
+        compute_dtype=config.compute_dtype,
+        param_dtype=config.param_dtype or mx.float32,
+        name=config.name,
+    )
+    return cls(mlx_config)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      equation: str | None = None,
+      output_shape: tuple[int | None, ...] = (),
+      bias_axes: str = '',
+      activation=None,
+      compute_dtype=None,
+      param_dtype=mx.float32,
+  ):
+    """Initialize EinsumDense."""
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if equation is None:
+        raise ValueError('Must provide either config or equation')
+      self.config = self.Config(
+          equation=equation,
+          output_shape=output_shape,
+          bias_axes=bias_axes,
+          activation=activation,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+      )
+    self._compute_dtype = _to_mx_dtype(self.config.compute_dtype)
+    self._param_dtype = _to_mx_dtype(self.config.param_dtype) or mx.float32
+    self.activation = init_mapping.map_activation(self.config.activation)
+    self.kernel = None
+    self.bias = None
+    self._initialized = False
+    self._resolved_output_shape = None
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  def _ensure_initialized(self, input_shape):
+    """Ensure parameters are initialized."""
+    if self._initialized:
+      return
+    output_shape, kernel_shape, bias_shape = _compute_shapes(
+        self.config.equation,
+        input_shape,
+        self.config.output_shape,
+        self.config.bias_axes,
+    )
+    self._resolved_output_shape = output_shape
+    self.kernel = mx.zeros(kernel_shape, dtype=self._param_dtype)
+    if bias_shape is not None:
+      self.bias = mx.zeros(bias_shape, dtype=self._param_dtype)
+    self._initialized = True
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    """Get output shape."""
+    output_shape, _, _ = _compute_shapes(
+        self.config.equation,
+        input_shape,
+        self.config.output_shape,
+        self.config.bias_axes,
+    )
+    return output_shape
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    if self._compute_dtype is not None:
+      return self._compute_dtype
+    assert self._param_dtype is not None
+    return self._param_dtype
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    self._ensure_initialized(x.channel_shape)
+    compute_dtype = self.get_output_dtype(x.dtype)
+    activation = self.activation
+
+    def einsum_fn(v):
+      y = mx.einsum(self.config.equation, v.astype(compute_dtype), self.kernel)
+      if self.bias is not None:
+        y = y + self.bias
+      if activation is not None:
+        y = activation(y)
+      return y
+
+    if self.bias is not None or activation is not None:
+      return x.apply_values(einsum_fn)
+    return x.apply_values_masked(einsum_fn)
+
+
+def _parse_equation(equation):
+  """Parse einsum equation of form '...ab,bc->...ac'."""
+  if '->' not in equation:
+    raise ValueError(f'equation is not valid for EinsumDense: {equation}')
+  left, output_spec = equation.split('->')
+  input_spec, kernel_spec = left.split(',')
+  if not input_spec.startswith('...') or not output_spec.startswith('...'):
+    raise ValueError('Equation must be of the form "...X,Y->...Z".')
+  if 3 + len(set(input_spec[3:])) != len(input_spec):
+    raise ValueError(
+        f'Equation {input_spec=} must not contain duplicate variables.'
+    )
+  if 3 + len(set(output_spec[3:])) != len(output_spec):
+    raise ValueError(
+        f'Equation {output_spec=} must not contain duplicate variables.'
+    )
+  return input_spec, kernel_spec, output_spec
+
+
+def _compute_shapes(equation, input_shape, output_shape_spec, bias_axes):
+  """Compute kernel_shape and bias_shape from equation and shapes."""
+  input_spec, kernel_spec, output_spec = _parse_equation(equation)
+  in_spec = input_spec[3:]
+  out_spec = output_spec[3:]
+
+  if len(in_spec) != len(input_shape):
+    raise ValueError(f'Equation {in_spec=} does not match {input_shape=} rank.')
+
+  input_dims = {d: input_shape[i] for i, d in enumerate(in_spec)}
+  output_shape = list(output_shape_spec)
+  if len(out_spec) != len(output_shape):
+    raise ValueError(f'Equation {out_spec=} does not match {output_shape=}.')
+
+  for i, d in enumerate(out_spec):
+    if output_shape[i] is None:
+      output_shape[i] = input_dims[d]
+    elif d in input_dims and output_shape[i] != input_dims[d]:
+      raise ValueError(
+          f'Inconsistent dimension {d=}. {output_shape=} vs {input_shape=}'
+      )
+
+  output_dim_map = {d: output_shape[i] for i, d in enumerate(out_spec)}
+
+  kernel_shape = []
+  for d in kernel_spec:
+    if d in input_dims:
+      kernel_shape.append(input_dims[d])
+    elif d in output_dim_map:
+      kernel_shape.append(output_dim_map[d])
+    else:
+      raise ValueError(f"Weight dimension '{d}' not in input or output spec.")
+
+  if bias_axes:
+    first_bias_loc = min(out_spec.find(c) for c in bias_axes)
+    bias_out_spec = out_spec[first_bias_loc:]
+    bias_shape = [
+        output_dim_map[c] if c in bias_axes else 1 for c in bias_out_spec
+    ]
+  else:
+    bias_shape = None
+
+  return tuple(output_shape), tuple(kernel_shape), bias_shape
diff --git a/sequence_layers/mlx/dense_test.py b/sequence_layers/mlx/dense_test.py
new file mode 100644
index 0000000..49a9e63
--- /dev/null
+++ b/sequence_layers/mlx/dense_test.py
@@ -0,0 +1,28 @@
+"""Tests for Dense MLX sequence layers."""
+
+from absl.testing import absltest
+from mlx import nn
+
+from sequence_layers.mlx import dense
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import dense_behaviors as spec
+
+
+class DenseTest(test_utils.SequenceLayerTest, spec.DenseTest):
+  """Test behavior of Dense layer."""
+
+  def test_activation(self):
+    """Test activation in Dense."""
+    layer = dense.Dense.Config(features=8, activation=nn.relu).make()
+    x = self.random_sequence(2, 3, 4)
+    # pyrefly: ignore[bad-argument-type]
+    layer = self.init_layer(layer, x)
+    self.verify_contract(layer, x)  # pyrefly: ignore[bad-argument-type]
+
+
+class EinsumDenseTest(test_utils.SequenceLayerTest, spec.EinsumDenseTest):
+  """Test behavior of EinsumDense layer."""
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/dsp.py b/sequence_layers/mlx/dsp.py
new file mode 100644
index 0000000..1a2e102
--- /dev/null
+++ b/sequence_layers/mlx/dsp.py
@@ -0,0 +1,1477 @@
+"""DSP layers for MLX."""
+
+# pylint: disable=protected-access,attribute-defined-outside-init,missing-class-docstring,missing-function-docstring,unsubscriptable-object,no-else-return,unnecessary-lambda
+
+import dataclasses
+import fractions
+from typing import override
+
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.mlx import convolution as conv_utils
+from sequence_layers.mlx import types
+from sequence_layers.specs import dsp as spec
+
+from . import types as bt
+
+Sequence = bt.Sequence
+MaskedSequence = bt.MaskedSequence
+PaddingMode = bt.PaddingMode
+
+
+# ---------------------------------------------------------------------------
+# Signal utilities
+# ---------------------------------------------------------------------------
+
+
+def hann_window(window_length, periodic=True, dtype=np.float32):
+  """Compute a periodic Hann window."""
+  if window_length == 1:
+    return np.ones([1], dtype=dtype)
+  even = 1 - window_length % 2
+  n = np.asarray(window_length + int(periodic) * even - 1, dtype=dtype)
+  count = np.arange(window_length, dtype=dtype)
+  return np.asarray(0.5 - 0.5 * np.cos(2 * np.pi * count / n), dtype)
+
+
+def frame(values, frame_length, frame_step, pad_mode='valid', axis=1):
+  """Produce overlapping frames of a signal along `axis`.
+
+  Args:
+    values: [..., T, ...] array.
+    frame_length: Length of each frame.
+    frame_step: Stride between frames.
+    pad_mode: Padding mode string or 'valid'.
+    axis: Axis along which to frame.
+
+  Returns:
+    [..., num_frames, frame_length, ...] array.
+  """
+  # Normalize axis.
+  if axis < 0:
+    axis += values.ndim
+
+  t = values.shape[axis]
+
+  # Apply padding if needed.
+  if isinstance(pad_mode, str) and pad_mode != PaddingMode.VALID.value:
+    pad_left, pad_right = conv_utils._explicit_padding(
+        pad_mode, frame_length, frame_step, 1
+    )
+    pad_widths = [(0, 0)] * values.ndim
+    pad_widths[axis] = (pad_left, pad_right)
+    values = mx.pad(values, pad_widths)
+    t = values.shape[axis]
+
+  # Compute number of frames.
+  num_frames = max(0, (t - frame_length) // frame_step + 1)
+
+  # Move target axis to position 1 for uniform handling.
+  if axis != 1:
+    perm = list(range(values.ndim))
+    perm[1], perm[axis] = perm[axis], perm[1]
+    values = mx.transpose(values, perm)
+
+  # values shape: [batch, t, ...]
+  batch = values.shape[0]
+  rest_shape = values.shape[2:]
+
+  # Fast path: zero-copy strided view for contiguous data.
+  rest_size = 1
+  for d in rest_shape:
+    rest_size *= d
+
+  batch_stride = t * rest_size
+  frame_stride = frame_step * rest_size
+  time_stride = rest_size
+
+  # Compute rest strides from contiguous layout.
+  rest_strides = []
+  s = 1
+  for d in reversed(rest_shape):
+    rest_strides.append(s)
+    s *= d
+  rest_strides.reverse()
+
+  result = mx.as_strided(
+      values,
+      shape=(batch, num_frames, frame_length) + rest_shape,
+      strides=(batch_stride, frame_stride, time_stride) + tuple(rest_strides),
+  )
+
+  if axis != 1:
+    # Move back.
+    perm = list(range(result.ndim))
+    perm[1], perm[axis] = perm[axis], perm[1]
+    if axis > 1:
+      perm.insert(axis + 1, perm.pop(2))
+    result = mx.transpose(result, perm)
+
+  return result
+
+
+def overlap_and_add(signal_arr, frame_step):
+  """Overlap-add framed signal.
+
+  Args:
+    signal_arr: [..., frames, frame_length] array.
+    frame_step: Stride between frames.
+
+  Returns:
+    [..., output_length] array where
+    output_length = (frames - 1) * frame_step + frame_length.
+  """
+  shape = signal_arr.shape
+  outer_dims = shape[:-2]
+  frames = shape[-2]
+  frame_length = shape[-1]
+  output_length = frame_length + frame_step * (frames - 1)
+
+  if frame_length == frame_step:
+    return signal_arr.reshape(outer_dims + (output_length,))
+
+  # Vectorized overlap-add via scatter.
+  outer_size = 1
+  for d in outer_dims:
+    outer_size *= d
+
+  flat = signal_arr.reshape(outer_size, frames, frame_length)
+
+  # Build output position indices: [frames, frame_length].
+  offsets = mx.arange(frames)[:, None] * frame_step
+  positions = offsets + mx.arange(frame_length)[None, :]
+  flat_positions = positions.reshape(-1)  # [frames * frame_length]
+
+  # Flatten signal and scatter-add all frame contributions at once.
+  flat_signal = flat.reshape(outer_size, frames * frame_length)
+  result = mx.zeros((outer_size, output_length), dtype=flat.dtype)
+  result = result.at[:, flat_positions].add(flat_signal)
+
+  return result.reshape(outer_dims + (output_length,))
+
+
+def linear_to_mel_weight_matrix(
+    num_mel_bins,
+    num_spectrogram_bins,
+    sample_rate,
+    lower_edge_hertz,
+    upper_edge_hertz,
+    dtype=np.float64,
+):
+  """Create a weight matrix for converting linear spectrogram to mel."""
+
+  # Mel scale conversion (HTK formula).
+  def hz_to_mel(f):
+    return 2595.0 * np.log10(1.0 + f / 700.0)
+
+  def mel_to_hz(m):
+    return 700.0 * (10.0 ** (m / 2595.0) - 1.0)
+
+  nyquist = sample_rate / 2.0
+  freq_bins = np.linspace(0, nyquist, num_spectrogram_bins)
+
+  mel_low = hz_to_mel(lower_edge_hertz)
+  mel_high = hz_to_mel(upper_edge_hertz)
+  mel_points = np.linspace(mel_low, mel_high, num_mel_bins + 2)
+  hz_points = mel_to_hz(mel_points)
+
+  lower = hz_points[:-2][np.newaxis, :]  # [1, num_mel_bins]
+  center = hz_points[1:-1][np.newaxis, :]  # [1, num_mel_bins]
+  upper = hz_points[2:][np.newaxis, :]  # [1, num_mel_bins]
+  freq = freq_bins[:, np.newaxis]  # [num_spectrogram_bins, 1]
+
+  rising = np.where(
+      (freq >= lower) & (freq <= center) & (center > lower),
+      (freq - lower) / np.maximum(center - lower, 1e-10),
+      0.0,
+  )
+  falling = np.where(
+      (freq > center) & (freq <= upper) & (upper > center),
+      (upper - freq) / np.maximum(upper - center, 1e-10),
+      0.0,
+  )
+  return (rising + falling).astype(dtype)
+
+
+# ---------------------------------------------------------------------------
+# Delay
+# ---------------------------------------------------------------------------
+
+
+class Delay(
+    types.PreservesShape, types.PreservesType, types.SequenceLayer, spec.Delay
+):
+  """Delays input by `length` timesteps."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Delay.Config):
+
+    @override
+    def make(self) -> 'Delay':
+      return Delay.from_config(self)
+
+  def __init__(self, *, length, delay_layer_output=True):
+    super().__init__()
+    if length < 0:
+      raise ValueError(f'length must be non-negative, got {length}.')
+    self.length = length
+    self.delay_layer_output = delay_layer_output
+
+  @property
+  @override
+  def input_latency(self):
+    return self.length
+
+  @property
+  @override
+  def output_latency(self):
+    return 0 if self.delay_layer_output else self.length
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    if not self.length:
+      return ()
+    return Sequence(
+        mx.zeros(
+            (batch_size, self.length) + input_spec.shape,
+            dtype=input_spec.dtype,
+        ),
+        mx.zeros(
+            (batch_size, self.length),
+            dtype=bt.MASK_DTYPE,
+        ),
+    )
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x: Sequence, state: Sequence, *, training: bool, constants=None
+  ):
+    if not self.length:
+      return x, state
+    state = state.concatenate(x)
+    t = x.shape[1]
+    y = Sequence(state.values[:, :t], state.mask[:, :t])
+    state = Sequence(state.values[:, t:], state.mask[:, t:])
+    return y, state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if self.delay_layer_output:
+      return x.pad_time(self.length, 0, valid=False)
+    return x
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        length=config.length,
+        delay_layer_output=config.delay_layer_output,
+    )
+    layer.config = config
+    return layer
+
+
+# ---------------------------------------------------------------------------
+# Lookahead
+# ---------------------------------------------------------------------------
+
+
+class Lookahead(
+    types.PreservesShape,
+    types.PreservesType,
+    types.SequenceLayer,
+    spec.Lookahead,
+):
+  """Drops the first `length` timesteps from the input."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Lookahead.Config):
+
+    @override
+    def make(self) -> 'Lookahead':
+      return Lookahead.from_config(self)
+
+  def __init__(self, *, length, preserve_length_in_layer=False):
+    super().__init__()
+    if length < 0:
+      raise ValueError(f'length must be non-negative, got {length}.')
+    self.length = length
+    self.preserve_length_in_layer = preserve_length_in_layer
+
+  @property
+  @override
+  def input_latency(self):
+    return 0
+
+  @property
+  @override
+  def output_latency(self):
+    return self.length
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    if not self.length:
+      return ()
+    return mx.full(
+        (batch_size,),
+        self.length + 1,
+        dtype=mx.int32,
+    )
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x: Sequence, state: mx.array, *, training: bool, constants=None
+  ):
+    if not self.length:
+      return x, state
+    increments = mx.cumsum(x.mask.astype(mx.int32), axis=1)
+    countdown = mx.maximum(0, state[:, None] - increments)
+    mask = mx.logical_and(
+        x.mask, countdown == 0  # pyrefly: ignore[bad-argument-type]
+    )
+    y = Sequence(x.values, mask)
+    state = countdown[:, -1]
+    return y, state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if not self.length:
+      return x
+    x = x[:, self.length :]
+    if self.preserve_length_in_layer:
+      return x.pad_time(0, self.length, valid=False)
+    return x
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        length=config.length,
+        preserve_length_in_layer=config.preserve_length_in_layer,
+    )
+    layer.config = config
+    return layer
+
+
+# ---------------------------------------------------------------------------
+# Window
+# ---------------------------------------------------------------------------
+
+
+class Window(
+    types.PreservesShape, types.PreservesType, types.Stateless, spec.Window
+):
+  """Applies a window function along a channel axis."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Window.Config):
+
+    @override
+    def make(self) -> 'Window':
+      return Window.from_config(self)
+
+  def __init__(self, *, axis, window_fn=None):
+    super().__init__()
+    self._axis = axis
+    self._window_fn = window_fn or hann_window
+
+  def _get_axis(self, x):
+    axis = self._axis
+    if axis < 0:
+      axis += x.ndim
+    if axis < 2:
+      raise ValueError(
+          f'Window axis must be a channel axis (>= 2), got {axis}.'
+      )
+    return axis
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    axis = self._get_axis(x)
+    window_length = x.shape[axis]
+    window = self._window_fn(window_length)
+    window = mx.array(window, dtype=x.dtype)
+    shape = [1] * x.ndim
+    shape[axis] = window_length
+    window = window.reshape(shape)
+    return x.apply_values_masked(lambda v: v * window)
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        axis=config.axis,
+        window_fn=config.window_fn,
+    )
+    layer.config = config
+    return layer
+
+
+# ---------------------------------------------------------------------------
+# Frame
+# ---------------------------------------------------------------------------
+
+
+class Frame(types.PreservesType, types.SequenceLayer, spec.Frame):
+  """Produces overlapping frames of the input sequence."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Frame.Config):
+
+    @override
+    def make(self) -> 'Frame':
+      return Frame.from_config(self)
+
+  def __init__(self, *, frame_length, frame_step, padding='valid'):
+    super().__init__()
+    if frame_length <= 0:
+      raise ValueError(f'frame_length must be positive: {frame_length}')
+    if frame_step <= 0:
+      raise ValueError(f'frame_step must be positive: {frame_step}')
+    self.frame_length = frame_length
+    self.frame_step = frame_step
+    self.padding = padding
+
+  @property
+  @override
+  def supports_step(self):
+    return conv_utils._supports_step(self.padding)
+
+  @property
+  @override
+  def block_size(self):
+    return self.frame_step
+
+  @property
+  @override
+  def output_ratio(self):
+    return fractions.Fraction(1, self.frame_step)
+
+  @property
+  @override
+  def input_latency(self):
+    if self.padding in (
+        PaddingMode.CAUSAL_VALID.value,
+        PaddingMode.CAUSAL.value,
+        PaddingMode.SEMICAUSAL.value,
+    ):
+      return 0
+    elif self.padding in (
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+        PaddingMode.REVERSE_CAUSAL.value,
+    ):
+      return self.frame_length - 1
+    return 0
+
+  @property
+  def _buffer_width(self):
+    if self.padding == PaddingMode.SEMICAUSAL.value:
+      return max(self.frame_length - self.frame_step, 0)
+    elif self.padding in (
+        PaddingMode.REVERSE_CAUSAL.value,
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+    ):
+      return (self.frame_length - 1) // self.frame_step * self.frame_step
+    elif self.padding in (
+        PaddingMode.CAUSAL.value,
+        PaddingMode.CAUSAL_VALID.value,
+    ):
+      return self.frame_length - 1
+    else:
+      raise ValueError(f'Unsupported step padding: {self.padding}')
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return (self.frame_length,) + tuple(input_shape)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return input_dtype
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    bw = self._buffer_width
+    if not bw:
+      return ()
+    return conv_utils._compute_initial_state(
+        batch_size,
+        input_spec,
+        bw,
+        self.padding,
+    )
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self,
+      x: Sequence,
+      state: Sequence | tuple[()],
+      *,
+      training: bool,
+      constants=None,
+  ):
+    if self.frame_length > 1:
+      x = x.mask_invalid()
+
+    bw = self._buffer_width
+    if bw:
+      assert isinstance(state, Sequence)
+      state = state.concatenate(x)
+    else:
+      state = x
+
+    values = frame(
+        state.values,
+        frame_length=self.frame_length,
+        frame_step=self.frame_step,
+        pad_mode=PaddingMode.VALID.value,
+        axis=1,
+    )
+    mask = conv_utils._compute_conv_mask(
+        state.mask,
+        self.frame_length,
+        self.frame_step,
+        1,
+        self.padding,
+        is_step=True,
+    )
+
+    if bw:
+      assert isinstance(state, Sequence)
+      state = state[:, -bw:]
+    else:
+      state = ()
+
+    return Sequence(values, mask), state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if self.frame_length > 1:
+      x = x.mask_invalid()
+
+    values = frame(
+        x.values,
+        frame_length=self.frame_length,
+        frame_step=self.frame_step,
+        pad_mode=self.padding,
+        axis=1,
+    )
+    mask = conv_utils._compute_conv_mask(
+        x.mask,
+        self.frame_length,
+        self.frame_step,
+        1,
+        self.padding,
+        is_step=False,
+    )
+    return Sequence(values, mask)
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        frame_length=config.frame_length,
+        frame_step=config.frame_step,
+        padding=config.padding,
+    )
+    layer.config = config
+    return layer
+
+
+# ---------------------------------------------------------------------------
+# OverlapAdd
+# ---------------------------------------------------------------------------
+
+
+class OverlapAdd(types.PreservesType, types.SequenceLayer, spec.OverlapAdd):
+  """Overlap-adds windows of [b, t, frame_length, ...].
+
+  Output shape: [b, to, ...] where
+  to = (ti - 1) * frame_step + frame_length.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.OverlapAdd.Config):
+
+    @override
+    def make(self) -> 'OverlapAdd':
+      return OverlapAdd.from_config(self)
+
+  def __init__(self, *, frame_length, frame_step, padding='valid'):
+    super().__init__()
+    if frame_length <= 0:
+      raise ValueError(f'frame_length must be positive: {frame_length}')
+    if frame_step <= 0:
+      raise ValueError(f'frame_step must be positive: {frame_step}')
+    if frame_length < frame_step:
+      raise ValueError('frame_length must be >= frame_step.')
+    if padding not in (
+        PaddingMode.CAUSAL.value,
+        PaddingMode.VALID.value,
+        PaddingMode.SEMICAUSAL_FULL.value,
+    ):
+      raise ValueError(f'Unsupported padding: {padding}')
+    self.frame_length = frame_length
+    self.frame_step = frame_step
+    self.padding = padding
+
+  @property
+  @override
+  def supports_step(self):
+    return self.padding == PaddingMode.CAUSAL.value
+
+  @property
+  @override
+  def output_ratio(self):
+    return fractions.Fraction(self.frame_step)
+
+  @property
+  def _buffer_width(self):
+    return max(0, self.frame_length - self.frame_step)
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if not input_shape or input_shape[0] != self.frame_length:
+      raise ValueError(
+          f'OverlapAdd expects (frame_length, ...) input, got {input_shape}.'
+      )
+    return tuple(input_shape[1:])
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return input_dtype
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    if not input_shape_valid(input_spec.shape, self.frame_length):
+      raise ValueError(f'Invalid input_spec shape: {input_spec.shape}')
+    bw = self._buffer_width
+    if not bw:
+      return ()
+    out_shape = tuple(input_spec.shape[1:])
+    return mx.zeros(
+        (batch_size, bw) + out_shape,
+        dtype=input_spec.dtype,
+    )
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self,
+      x: Sequence,
+      state: mx.array | tuple[()],
+      *,
+      training: bool,
+      constants=None,
+  ):
+    if self.frame_length > 1:
+      x = x.mask_invalid()
+
+    # Transpose [num_frames, frame_length] to end.
+    if x.ndim > 3:
+      # Move axes 1,2 to -2,-1.
+      axes = list(range(x.ndim))
+      axes.remove(1)
+      axes.remove(2)
+      axes.extend([1, 2])
+      values = mx.transpose(x.values, axes)
+    else:
+      values = x.values
+
+    values = overlap_and_add(values, self.frame_step)
+
+    if x.ndim > 3:
+      # Move back.
+      values = mx.moveaxis(values, -1, 1)
+
+    mask = conv_utils._compute_conv_transpose_mask(
+        x.mask,
+        self.frame_length,
+        self.frame_step,
+        1,
+        self.padding,
+    )
+
+    bw = self._buffer_width
+    if bw:
+      assert isinstance(state, mx.array)
+      time = x.shape[1]
+      # Pad state to extend to values length.
+      pad_right = max(0, values.shape[1] - bw)
+      pad_widths = [(0, 0)] * state.ndim
+      pad_widths[1] = (0, pad_right)
+      padded_state = mx.pad(state, pad_widths)
+
+      values = values + padded_state
+
+      output_samples = self.frame_step * time
+      output = values[:, :output_samples]
+      state = values[:, output_samples : output_samples + bw]
+      if state.shape[1] < bw:
+        pad_widths = [(0, 0)] * state.ndim
+        pad_widths[1] = (0, bw - state.shape[1])
+        state = mx.pad(state, pad_widths)
+      values = output
+
+    return Sequence(values, mask), state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if self.frame_length > 1:
+      x = x.mask_invalid()
+
+    if x.ndim > 3:
+      axes = list(range(x.ndim))
+      axes.remove(1)
+      axes.remove(2)
+      axes.extend([1, 2])
+      values = mx.transpose(x.values, axes)
+    else:
+      values = x.values
+
+    values = overlap_and_add(values, self.frame_step)
+
+    if x.ndim > 3:
+      values = mx.moveaxis(values, -1, 1)
+
+    mask = conv_utils._compute_conv_transpose_mask(
+        x.mask,
+        self.frame_length,
+        self.frame_step,
+        1,
+        self.padding,
+    )
+
+    trim = max(self.frame_length - self.frame_step, 0)
+    if self.padding == PaddingMode.CAUSAL.value:
+      if trim:
+        values = values[:, :-trim]
+    elif self.padding == PaddingMode.SEMICAUSAL_FULL.value:
+      if trim:
+        values = values[:, trim:]
+        mask = mask[:, trim:]
+      size = min(values.shape[1], mask.shape[1])
+      return Sequence(values[:, :size], mask[:, :size])
+
+    return Sequence(values, mask)
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        frame_length=config.frame_length,
+        frame_step=config.frame_step,
+        padding=config.padding,
+    )
+    layer.config = config
+    return layer
+
+
+def input_shape_valid(shape, frame_length):
+  return shape and shape[0] == frame_length
+
+
+# ---------------------------------------------------------------------------
+# FFT layers
+# ---------------------------------------------------------------------------
+
+
+def _validate_and_normalize_axis(axis, input_shape):
+  """Normalize axis for FFT, ensuring it's a channel axis."""
+  if axis < 0:
+    axis += len(input_shape)
+  if axis < 0 or axis >= len(input_shape):
+    raise ValueError(f'Axis {axis} out of range for shape {input_shape}.')
+  if axis in (0, 1):
+    raise ValueError(f'FFT over batch/time not allowed. Got axis={axis}.')
+  return axis
+
+
+def _pad_or_truncate_for_fft(x, axis, required_length, padding):
+  """Pad or truncate sequence for FFT."""
+  input_dim = x.shape[axis]
+  if input_dim == required_length:
+    return x
+  if input_dim < required_length:
+    pad_amount = required_length - input_dim
+    if padding == 'center':
+      pad_left = pad_amount // 2
+      pad_right = pad_amount - pad_left
+    else:
+      pad_left = 0
+      pad_right = pad_amount
+    pad_widths = [(0, 0)] * x.ndim
+    pad_widths[axis] = (pad_left, pad_right)
+    return x.apply_values_masked(mx.pad, pad_widths)
+  else:
+    # Truncate.
+    if padding == 'center':
+      start = (input_dim - required_length) // 2
+    else:
+      start = 0
+    slices = [slice(None)] * x.ndim
+    slices[axis] = slice(start, start + required_length)
+    return x.apply_values_masked(lambda v: v[tuple(slices)])
+
+
+class FFT(types.Stateless, spec.FFT):
+  """Applies FFT to a channel dimension."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.FFT.Config):
+
+    @override
+    def make(self) -> 'FFT':
+      return FFT.from_config(self)
+
+  def __init__(self, *, fft_length=None, axis=-1, padding='right'):
+    super().__init__()
+    self.fft_length = fft_length
+    self._axis = axis
+    self._padding = padding
+
+  def _get_output_length(self, input_size):
+    return self.fft_length or input_size
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    shape = list(input_shape)
+    axis = (
+        _validate_and_normalize_axis(
+            self._axis, (None, None) + tuple(input_shape)
+        )
+        - 2
+    )
+    shape[axis] = self._get_output_length(shape[axis])
+    return tuple(shape)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return mx.complex64
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if x.ndim <= 2:
+      raise ValueError('FFT requires rank >= 3 input.')
+    axis = _validate_and_normalize_axis(self._axis, x.shape)
+    required = self._get_output_length(x.shape[axis])
+    x = _pad_or_truncate_for_fft(x, axis, required, self._padding)
+    return x.apply_values(mx.fft.fft, axis=axis)
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        fft_length=config.fft_length,
+        axis=config.axis,
+        padding=config.padding,
+    )
+    layer.config = config
+    return layer
+
+
+class IFFT(types.Stateless, spec.IFFT):
+  """Applies IFFT to a channel dimension."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.IFFT.Config):
+
+    @override
+    def make(self) -> 'IFFT':
+      return IFFT.from_config(self)
+
+  def __init__(
+      self,
+      *,
+      fft_length=None,
+      frame_length=None,
+      axis=-1,
+      padding='right',
+  ):
+    super().__init__()
+    self.fft_length = fft_length
+    self.frame_length = frame_length
+    self._axis = axis
+    self._padding = padding
+
+  def _get_output_length(self, input_size):
+    return self.frame_length or input_size
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    shape = list(input_shape)
+    axis = (
+        _validate_and_normalize_axis(
+            self._axis, (None, None) + tuple(input_shape)
+        )
+        - 2
+    )
+    shape[axis] = self._get_output_length(shape[axis])
+    return tuple(shape)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return mx.complex64
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if x.ndim <= 2:
+      raise ValueError('IFFT requires rank >= 3 input.')
+    axis = _validate_and_normalize_axis(self._axis, x.shape)
+    x = x.apply_values(mx.fft.ifft, axis=axis)
+    required = self._get_output_length(x.shape[axis])
+    return _pad_or_truncate_for_fft(x, axis, required, self._padding)
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        fft_length=config.fft_length,
+        frame_length=config.frame_length,
+        axis=config.axis,
+        padding=config.padding,
+    )
+    layer.config = config
+    return layer
+
+
+class RFFT(types.Stateless, spec.RFFT):
+  """Applies RFFT to a channel dimension."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.RFFT.Config):
+
+    @override
+    def make(self) -> 'RFFT':
+      return RFFT.from_config(self)
+
+  def __init__(self, *, fft_length=None, axis=-1, padding='right'):
+    super().__init__()
+    self.fft_length = fft_length
+    self._axis = axis
+    self._padding = padding
+
+  def _get_fft_length(self, input_size):
+    return self.fft_length or input_size
+
+  def _get_output_length(self, input_size):
+    return self._get_fft_length(input_size) // 2 + 1
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    shape = list(input_shape)
+    axis = (
+        _validate_and_normalize_axis(
+            self._axis, (None, None) + tuple(input_shape)
+        )
+        - 2
+    )
+    shape[axis] = self._get_output_length(shape[axis])
+    return tuple(shape)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return mx.complex64
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if x.ndim <= 2:
+      raise ValueError('RFFT requires rank >= 3 input.')
+    axis = _validate_and_normalize_axis(self._axis, x.shape)
+    fft_len = self._get_fft_length(x.shape[axis])
+    x = _pad_or_truncate_for_fft(x, axis, fft_len, self._padding)
+
+    def rfft_fn(v):
+      if v.dtype == mx.bfloat16:
+        v = v.astype(mx.float32)
+      return mx.fft.rfft(v, n=fft_len, axis=axis)
+
+    return x.apply_values(rfft_fn)
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        fft_length=config.fft_length,
+        axis=config.axis,
+        padding=config.padding,
+    )
+    layer.config = config
+    return layer
+
+
+class IRFFT(types.Stateless, spec.IRFFT):
+  """Applies IRFFT to a channel dimension."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.IRFFT.Config):
+
+    @override
+    def make(self) -> 'IRFFT':
+      return IRFFT.from_config(self)
+
+  def __init__(
+      self,
+      *,
+      fft_length=None,
+      frame_length=None,
+      axis=-1,
+      padding='right',
+  ):
+    super().__init__()
+    self.fft_length = fft_length
+    self.frame_length = frame_length
+    self._axis = axis
+    self._padding = padding
+
+  def _get_fft_length(self, input_size):
+    return self.fft_length or (input_size - 1) * 2
+
+  def _get_output_length(self, input_size):
+    return self.frame_length or self._get_fft_length(input_size)
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    shape = list(input_shape)
+    axis = (
+        _validate_and_normalize_axis(
+            self._axis, (None, None) + tuple(input_shape)
+        )
+        - 2
+    )
+    shape[axis] = self._get_output_length(shape[axis])
+    return tuple(shape)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return mx.float32
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if x.ndim <= 2:
+      raise ValueError('IRFFT requires rank >= 3 input.')
+    axis = _validate_and_normalize_axis(self._axis, x.shape)
+    fft_len = self._get_fft_length(x.shape[axis])
+    x = x.apply_values(lambda v: mx.fft.irfft(v, n=fft_len, axis=axis))
+    required = self.frame_length or x.shape[axis]
+    return _pad_or_truncate_for_fft(x, axis, required, self._padding)
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        fft_length=config.fft_length,
+        frame_length=config.frame_length,
+        axis=config.axis,
+        padding=config.padding,
+    )
+    layer.config = config
+    return layer
+
+
+# ---------------------------------------------------------------------------
+# STFT
+# ---------------------------------------------------------------------------
+
+
+class STFT(types.SequenceLayer, spec.STFT):
+  """Short-Time Fourier Transform."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.STFT.Config):
+
+    @override
+    def make(self) -> 'STFT':
+      return STFT.from_config(self)
+
+  def __init__(
+      self,
+      *,
+      frame_length,
+      frame_step,
+      fft_length,
+      window_fn=None,
+      time_padding='reverse_causal_valid',
+      fft_padding='right',
+      output_magnitude=False,
+  ):
+    super().__init__()
+    self._frame_length = frame_length
+    self._frame_step = frame_step
+    self._fft_length = fft_length
+    self._window_fn = window_fn or hann_window
+    self._time_padding = time_padding
+    self._fft_padding = fft_padding
+    self._output_magnitude = output_magnitude
+
+    self.framer = Frame(
+        frame_length=frame_length,
+        frame_step=frame_step,
+        padding=time_padding,
+    )
+    self.fft = RFFT(
+        fft_length=fft_length,
+        axis=2,
+        padding=fft_padding,
+    )
+
+  @property
+  @override
+  def supports_step(self):
+    return self.framer.supports_step
+
+  @property
+  @override
+  def block_size(self):
+    return self.framer.block_size
+
+  @property
+  @override
+  def output_ratio(self):
+    return self.framer.output_ratio
+
+  @property
+  @override
+  def input_latency(self):
+    return self.framer.input_latency
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    frame_shape = self.framer.get_output_shape(input_shape, constants=constants)
+    return self.fft.get_output_shape(frame_shape, constants=constants)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    fft_dtype = self.fft.get_output_dtype(input_dtype, constants=constants)
+    if self._output_magnitude:
+      return mx.float32
+    return fft_dtype
+
+  def _apply_window(self, x):
+    if self._window_fn:
+      window = self._window_fn(self._frame_length)
+      window = mx.array(window, dtype=x.dtype)
+      shape = [1] * x.ndim
+      shape[2] = self._frame_length
+      window = window.reshape(shape)
+      return x.apply_values_masked(lambda v: v * window)
+    return x
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    return self.framer.get_initial_state(
+        batch_size, input_spec, training=training, constants=constants
+    )
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self,
+      x: Sequence,
+      state: Sequence | tuple[()],
+      *,
+      training: bool,
+      constants=None,
+  ):
+    framed, state = self.framer.step(
+        x, state, training=training, constants=constants
+    )
+    framed = self._apply_window(framed)
+    dft = self.fft.layer(framed, training=training, constants=constants)
+    if self._output_magnitude:
+      dft = dft.apply_values_masked(lambda v: mx.abs(v))
+    return dft, state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    framed = self.framer.layer(x, training=training, constants=constants)
+    framed = self._apply_window(framed)
+    dft = self.fft.layer(framed, training=training, constants=constants)
+    if self._output_magnitude:
+      dft = dft.apply_values_masked(lambda v: mx.abs(v))
+    return dft
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        frame_length=config.frame_length,
+        frame_step=config.frame_step,
+        fft_length=config.fft_length,
+        window_fn=config.window_fn,
+        time_padding=config.time_padding,
+        fft_padding=config.fft_padding,
+        output_magnitude=config.output_magnitude,
+    )
+    layer.config = config
+    return layer
+
+
+# ---------------------------------------------------------------------------
+# InverseSTFT
+# ---------------------------------------------------------------------------
+
+
+class InverseSTFT(types.SequenceLayer, spec.InverseSTFT):
+  """Inverse Short-Time Fourier Transform."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.InverseSTFT.Config):
+
+    @override
+    def make(self) -> 'InverseSTFT':
+      return InverseSTFT.from_config(self)
+
+  def __init__(
+      self,
+      *,
+      frame_length,
+      frame_step,
+      fft_length,
+      window_fn=None,
+      time_padding='causal',
+      fft_padding='right',
+  ):
+    super().__init__()
+    self._frame_length = frame_length
+    self._frame_step = frame_step
+    self._fft_length = fft_length
+    self._window_fn = window_fn or hann_window
+    self._time_padding = time_padding
+    self._fft_padding = fft_padding
+
+    self.overlap_add = OverlapAdd(
+        frame_length=frame_length,
+        frame_step=frame_step,
+        padding=time_padding,
+    )
+    self.irfft = IRFFT(
+        fft_length=fft_length,
+        frame_length=frame_length,
+        axis=2,
+        padding=fft_padding,
+    )
+
+  @property
+  @override
+  def supports_step(self):
+    return self.overlap_add.supports_step
+
+  @property
+  @override
+  def block_size(self):
+    return 1
+
+  @property
+  @override
+  def output_ratio(self):
+    return self.overlap_add.output_ratio
+
+  @property
+  @override
+  def input_latency(self):
+    return 0
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    irfft_shape = list(
+        self.irfft.get_output_shape(input_shape, constants=constants)
+    )
+    irfft_shape[0] = self._frame_length
+    return self.overlap_add.get_output_shape(irfft_shape, constants=constants)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return self.irfft.get_output_dtype(input_dtype, constants=constants)
+
+  def _apply_window(self, irfft):
+    """Pad/truncate to frame_length and apply window."""
+    fft_len = irfft.shape[2]
+    if fft_len > self._frame_length:
+      irfft = irfft.apply_values_masked(lambda v: v[:, :, : self._frame_length])
+    elif fft_len < self._frame_length:
+      pad_amount = self._frame_length - fft_len
+      if self._fft_padding == 'center':
+        pl = pad_amount // 2
+        pr = pad_amount - pl
+      else:
+        pl, pr = 0, pad_amount
+      pad_widths = [(0, 0)] * irfft.ndim
+      pad_widths[2] = (pl, pr)
+      irfft = irfft.apply_values_masked(mx.pad, pad_widths)
+
+    if self._window_fn:
+      window = self._window_fn(self._frame_length)
+      window = mx.array(window, dtype=irfft.dtype)
+      shape = [1] * irfft.ndim
+      shape[2] = self._frame_length
+      window = window.reshape(shape)
+      irfft = irfft.apply_values_masked(lambda v: v * window)
+    return irfft
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    irfft_spec = self.irfft.get_output_spec(input_spec, constants=constants)
+    irfft_shape = list(irfft_spec.shape)
+    irfft_shape[0] = self._frame_length
+    irfft_spec = bt.ShapeDType(tuple(irfft_shape), irfft_spec.dtype)
+    return self.overlap_add.get_initial_state(
+        batch_size, irfft_spec, training=training, constants=constants
+    )
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self,
+      x: Sequence,
+      state: mx.array | tuple[()],
+      *,
+      training: bool,
+      constants=None,
+  ):
+    if x.ndim < 3:
+      raise ValueError(f'Expected [b,t,fft_bins,...] input, got {x.shape}.')
+    irfft = self.irfft.layer(x, training=training, constants=constants)
+    irfft = self._apply_window(irfft)
+    ola, state = self.overlap_add.step(
+        irfft, state, training=training, constants=constants
+    )
+    return ola, state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    if x.ndim < 3:
+      raise ValueError(f'Expected [b,t,fft_bins,...] input, got {x.shape}.')
+    irfft = self.irfft.layer(x, training=training, constants=constants)
+    irfft = self._apply_window(irfft)
+    ola = self.overlap_add.layer(irfft, training=training, constants=constants)
+    return ola
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        frame_length=config.frame_length,
+        frame_step=config.frame_step,
+        fft_length=config.fft_length,
+        window_fn=config.window_fn,
+        time_padding=config.time_padding,
+        fft_padding=config.fft_padding,
+    )
+    layer.config = config
+    return layer
+
+
+# ---------------------------------------------------------------------------
+# LinearToMelSpectrogram
+# ---------------------------------------------------------------------------
+
+
+class LinearToMelSpectrogram(
+    types.PreservesType, types.Stateless, spec.LinearToMelSpectrogram
+):
+  """Converts linear spectrogram to mel spectrogram."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.LinearToMelSpectrogram.Config):
+
+    @override
+    def make(self) -> 'LinearToMelSpectrogram':
+      return LinearToMelSpectrogram.from_config(self)
+
+  def __init__(
+      self,
+      *,
+      num_mel_bins,
+      sample_rate,
+      lower_edge_hertz,
+      upper_edge_hertz,
+  ):
+    super().__init__()
+    self.num_mel_bins = num_mel_bins
+    self.sample_rate = sample_rate
+    self.lower_edge_hertz = lower_edge_hertz
+    self.upper_edge_hertz = upper_edge_hertz
+    self._cached_weights = None
+    self._cached_num_bins = None
+    self._cached_dtype = None
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    if not input_shape:
+      raise ValueError('LinearToMelSpectrogram requires rank >= 1 input.')
+    return tuple(input_shape[:-1]) + (self.num_mel_bins,)
+
+  def _get_weights(self, num_bins, dtype):
+    if (
+        self._cached_weights is None
+        or self._cached_num_bins != num_bins
+        or self._cached_dtype != dtype
+    ):
+      weights = linear_to_mel_weight_matrix(
+          num_mel_bins=self.num_mel_bins,
+          num_spectrogram_bins=num_bins,
+          sample_rate=self.sample_rate,
+          lower_edge_hertz=self.lower_edge_hertz,
+          upper_edge_hertz=self.upper_edge_hertz,
+      )
+      self._cached_weights = mx.array(weights, dtype=dtype)
+      self._cached_num_bins = num_bins
+      self._cached_dtype = dtype
+    return self._cached_weights
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    weights = self._get_weights(x.shape[-1], x.dtype)
+    return x.apply_values_masked(lambda v: v @ weights)
+
+  @classmethod
+  def from_config(cls, config):
+    layer = cls(
+        num_mel_bins=config.num_mel_bins,
+        sample_rate=config.sample_rate,
+        lower_edge_hertz=config.lower_edge_hertz,
+        upper_edge_hertz=config.upper_edge_hertz,
+    )
+    layer.config = config
+    return layer
diff --git a/sequence_layers/mlx/dsp_test.py b/sequence_layers/mlx/dsp_test.py
new file mode 100644
index 0000000..7bc1913
--- /dev/null
+++ b/sequence_layers/mlx/dsp_test.py
@@ -0,0 +1,182 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for DSP MLX sequence layers."""
+
+from absl.testing import absltest
+from absl.testing import parameterized
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.jax import dsp as jax_dsp
+from sequence_layers.mlx import dsp
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import dsp_behaviors as spec
+
+
+class DelayTest(test_utils.SequenceLayerTest, spec.DelayTest):
+
+  def test_from_config(self):
+    config = jax_dsp.Delay.Config(length=3)
+    mlx_layer = dsp.Delay.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.Delay)
+
+
+class LookaheadTest(test_utils.SequenceLayerTest, spec.LookaheadTest):
+
+  def test_from_config(self):
+    config = jax_dsp.Lookahead.Config(length=2)
+    mlx_layer = dsp.Lookahead.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.Lookahead)
+
+
+class WindowTest(test_utils.SequenceLayerTest, spec.WindowTest):
+
+  def test_from_config(self):
+    config = jax_dsp.Window.Config(axis=-1)
+    mlx_layer = dsp.Window.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.Window)
+
+
+class FrameTest(test_utils.SequenceLayerTest, spec.FrameTest):
+
+  def test_from_config(self):
+    config = jax_dsp.Frame.Config(
+        frame_length=4,
+        frame_step=2,
+        padding='causal',
+    )
+    mlx_layer = dsp.Frame.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.Frame)
+
+
+class OverlapAddTest(test_utils.SequenceLayerTest, spec.OverlapAddTest):
+
+  def test_from_config(self):
+    config = jax_dsp.OverlapAdd.Config(
+        frame_length=4,
+        frame_step=2,
+        padding='causal',
+    )
+    mlx_layer = dsp.OverlapAdd.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.OverlapAdd)
+
+
+class FFTTest(test_utils.SequenceLayerTest, spec.FFTTest):
+
+  def test_from_config(self):
+    config = jax_dsp.FFT.Config()
+    mlx_layer = dsp.FFT.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.FFT)
+
+
+class IFFTTest(test_utils.SequenceLayerTest, spec.IFFTTest):
+
+  def test_from_config(self):
+    config = jax_dsp.IFFT.Config()
+    mlx_layer = dsp.IFFT.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.IFFT)
+
+
+class RFFTTest(test_utils.SequenceLayerTest, spec.RFFTTest):
+
+  def test_from_config(self):
+    config = jax_dsp.RFFT.Config()
+    mlx_layer = dsp.RFFT.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.RFFT)
+
+
+class IRFFTTest(test_utils.SequenceLayerTest, spec.IRFFTTest):
+
+  def test_from_config(self):
+    config = jax_dsp.IRFFT.Config()
+    mlx_layer = dsp.IRFFT.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.IRFFT)
+
+
+class STFTTest(test_utils.SequenceLayerTest, spec.STFTTest):
+
+  def test_from_config(self):
+    config = jax_dsp.STFT.Config(
+        frame_length=16,
+        frame_step=8,
+        fft_length=16,
+        time_padding='causal',
+    )
+    mlx_layer = dsp.STFT.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.STFT)
+
+
+class InverseSTFTTest(test_utils.SequenceLayerTest, spec.InverseSTFTTest):
+
+  def test_from_config(self):
+    config = jax_dsp.InverseSTFT.Config(
+        frame_length=16,
+        frame_step=8,
+        fft_length=16,
+        time_padding='causal',
+    )
+    mlx_layer = dsp.InverseSTFT.from_config(config)
+    self.assertIsInstance(mlx_layer, dsp.InverseSTFT)
+
+
+class LinearToMelSpectrogramTest(
+    test_utils.SequenceLayerTest, spec.LinearToMelSpectrogramTest
+):
+
+  def test_from_config(self):
+    config = jax_dsp.LinearToMelSpectrogram.Config(
+        num_mel_bins=40,
+        sample_rate=16000.0,
+        lower_edge_hertz=80.0,
+        upper_edge_hertz=7600.0,
+    )
+    mlx_layer = dsp.LinearToMelSpectrogram.from_config(config)
+    self.assertIsInstance(
+        mlx_layer,
+        dsp.LinearToMelSpectrogram,
+    )
+
+
+class SignalUtilitiesTest(parameterized.TestCase):
+
+  def test_hann_window(self):
+    w = dsp.hann_window(4)
+    self.assertEqual(len(w), 4)
+    # Periodic Hann: endpoints should not both be zero.
+    self.assertGreater(w[-1], 0.0)
+
+  def test_frame(self):
+    values = mx.arange(10).reshape(1, 10, 1).astype(mx.float32)
+    framed = dsp.frame(values, 4, 2)
+    self.assertEqual(framed.shape, (1, 4, 4, 1))
+
+  def test_overlap_and_add_identity(self):
+    signal_arr = mx.array([[[1.0, 2.0], [3.0, 4.0]]])
+    result = dsp.overlap_and_add(signal_arr, 2)
+    np.testing.assert_allclose(np.array(result), [[1.0, 2.0, 3.0, 4.0]])
+
+  def test_mel_weight_matrix(self):
+    w = dsp.linear_to_mel_weight_matrix(
+        num_mel_bins=40,
+        num_spectrogram_bins=129,
+        sample_rate=16000,
+        lower_edge_hertz=80.0,
+        upper_edge_hertz=7600.0,
+    )
+    self.assertEqual(w.shape, (129, 40))
+    self.assertTrue(np.all(w >= 0))
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/export.py b/sequence_layers/mlx/export.py
new file mode 100644
index 0000000..1beebce
--- /dev/null
+++ b/sequence_layers/mlx/export.py
@@ -0,0 +1,198 @@
+"""Export MLX SequenceLayer step() to .mlxfn for streaming inference."""
+
+import mlx.core as mx
+
+from . import types as bt
+
+Sequence = bt.Sequence
+
+
+# ---------------------------------------------------------------------------
+# State flattening / unflattening
+# ---------------------------------------------------------------------------
+
+
+def _flatten_state(state):
+  """Flatten a nested pytree state into a list of mx.array.
+
+  Handles tuples, lists, and mx.array leaves. Empty tuples contribute
+  zero arrays.
+
+  Args:
+    state: Nested tuple/list of mx.array.
+
+  Returns:
+    (flat_arrays, structure) where structure encodes the nesting.
+  """
+  flat = []
+
+  def _record(node):
+    if isinstance(node, mx.array):
+      flat.append(node)
+      return 'array'
+    elif isinstance(node, tuple):
+      children = [_record(child) for child in node]
+      return ('tuple', children)
+    elif isinstance(node, list):
+      children = [_record(child) for child in node]
+      return ('list', children)
+    else:
+      raise TypeError(f'Unsupported state node type: {type(node)}')
+
+  structure = _record(state)
+  return flat, structure
+
+
+def _unflatten_state(flat, structure):
+  """Reconstruct a nested state from a flat array list and structure.
+
+  Args:
+    flat: List of mx.array.
+    structure: Structure descriptor from _flatten_state.
+
+  Returns:
+    Nested tuple/list matching the original structure.
+  """
+  idx = [0]
+
+  def _rebuild(struct):
+    if struct == 'array':
+      result = flat[idx[0]]
+      idx[0] += 1
+      return result
+    elif isinstance(struct, tuple) and struct[0] == 'tuple':
+      return tuple(_rebuild(s) for s in struct[1])
+    elif isinstance(struct, tuple) and struct[0] == 'list':
+      return [_rebuild(s) for s in struct[1]]
+    else:
+      raise ValueError(f'Unknown structure node: {struct}')
+
+  result = _rebuild(structure)
+  if idx[0] != len(flat):
+    raise ValueError(f'Not all arrays consumed: used {idx[0]} of {len(flat)}')
+  return result
+
+
+# ---------------------------------------------------------------------------
+# Export
+# ---------------------------------------------------------------------------
+
+
+def _materialize_deferred(model, batch_size, input_spec, *, constants=None):
+  """Run a dummy forward pass to materialize all deferred layers."""
+  x_values = mx.zeros(
+      (batch_size, 1) + input_spec.shape, dtype=input_spec.dtype
+  )
+  x_mask = mx.ones((batch_size, 1), dtype=mx.bool_)
+  x = Sequence(x_values, x_mask)
+  state = model.get_initial_state(
+      batch_size, input_spec, training=False, constants=constants
+  )
+  model.step(x, state, training=False, constants=constants)
+  mx.eval(model.parameters())
+
+
+def get_initial_state_flat(model, batch_size, input_spec, *, constants=None):
+  """Get flattened initial state arrays and structure for a model.
+
+  Args:
+    model: An MLX SequenceLayer.
+    batch_size: Batch size.
+    input_spec: A ShapeDType describing the input channels.
+    constants: Optional constants dict.
+
+  Returns:
+    (flat_arrays, structure) where flat_arrays is a list of mx.array
+    and structure can be used with _unflatten_state.
+  """
+  state = model.get_initial_state(
+      batch_size, input_spec, training=False, constants=constants
+  )
+  flat, structure = _flatten_state(state)
+  mx.eval(*flat) if flat else None
+  return flat, structure
+
+
+def export_step(
+    model,
+    path,
+    batch_size,
+    input_spec,
+    *,
+    constants=None,
+    time_steps=1,
+):
+  """Export model.step() to a .mlxfn file.
+
+  The exported function signature is:
+    (x_values, x_mask, *state_flat) -> (y_values, y_mask, *new_state_flat)
+
+  Model weights are captured in the closure and embedded in the .mlxfn
+  file. State arrays (e.g. KV cache) are explicit I/O.
+
+  The exported function uses fixed shapes (batch_size, time_steps).
+  For streaming generation, time_steps=1 is typical.
+
+  Args:
+    model: An MLX SequenceLayer with supports_step.
+    path: Output file path (should end in .mlxfn).
+    batch_size: Batch size for the exported function.
+    input_spec: A ShapeDType describing the input channel shape and dtype.
+    constants: Optional constants dict for cross-attention.
+    time_steps: Number of time steps per call (default 1).
+  """
+  if not model.supports_step:
+    raise ValueError(f'{model.__class__.__name__} does not support step().')
+
+  # Materialize all deferred layers.
+  _materialize_deferred(model, batch_size, input_spec, constants=constants)
+
+  # Get initial state and flatten.
+  flat_state, structure = get_initial_state_flat(
+      model, batch_size, input_spec, constants=constants
+  )
+
+  # Make sure all model params are evaluated.
+  mx.eval(model.parameters())
+
+  def step_fn(x_values, x_mask, *state_flat):
+    state = _unflatten_state(list(state_flat), structure)
+    x = Sequence(x_values, x_mask)
+    y, new_state = model.step(x, state, training=False, constants=constants)
+    new_flat, _ = _flatten_state(new_state)
+    return (y.values, y.mask, *new_flat)
+
+  # Create example inputs for tracing.
+  x_values = mx.zeros(
+      (batch_size, time_steps) + input_spec.shape,
+      dtype=input_spec.dtype,
+  )
+  x_mask = mx.ones((batch_size, time_steps), dtype=mx.bool_)
+  mx.eval(x_values, x_mask)
+
+  mx.export_function(
+      path,
+      step_fn,
+      x_values,
+      x_mask,
+      *flat_state,
+  )
+
+
+def run_exported(imported_fn, x_values, x_mask, state_flat):
+  """Call an imported .mlxfn step function.
+
+  Args:
+    imported_fn: A function from mx.import_function().
+    x_values: Input values array [batch, time, ...channels].
+    x_mask: Input mask array [batch, time].
+    state_flat: List of flat state arrays.
+
+  Returns:
+    (y_values, y_mask, new_state_flat) where new_state_flat is a list.
+  """
+  results = imported_fn(x_values, x_mask, *state_flat)
+  y_values = results[0]
+  y_mask = results[1]
+  new_state_flat = list(results[2:])
+  return y_values, y_mask, new_state_flat
diff --git a/sequence_layers/mlx/export_test.py b/sequence_layers/mlx/export_test.py
new file mode 100644
index 0000000..735f6c0
--- /dev/null
+++ b/sequence_layers/mlx/export_test.py
@@ -0,0 +1,297 @@
+"""Tests for MLX export utilities."""
+
+import os
+import tempfile
+
+import mlx.core as mx
+import numpy as np
+from absl.testing import absltest
+from absl.testing import parameterized
+
+from sequence_layers.mlx import basic_types as bt
+from sequence_layers.mlx import export
+from sequence_layers.mlx import test_utils
+
+Sequence = bt.Sequence
+ShapeDType = bt.ShapeDType
+
+
+class StateFlattenTest(parameterized.TestCase):
+  """Tests for state flatten/unflatten."""
+
+  def test_empty_tuple(self):
+    state = ()
+    flat, structure = export._flatten_state(state)
+    self.assertEmpty(flat)
+    rebuilt = export._unflatten_state(flat, structure)
+    self.assertEqual(rebuilt, ())
+
+  def test_single_array(self):
+    arr = mx.zeros((2, 3))
+    state = (arr,)
+    flat, structure = export._flatten_state(state)
+    self.assertLen(flat, 1)
+    rebuilt = export._unflatten_state(flat, structure)
+    self.assertIsInstance(rebuilt, tuple)
+    np.testing.assert_array_equal(rebuilt[0], arr)
+
+  def test_nested_tuples(self):
+    a = mx.ones((2,))
+    b = mx.zeros((3, 4))
+    c = mx.full((1,), 5.0)
+    state = ((a, b), (c, ()))
+    flat, structure = export._flatten_state(state)
+    self.assertLen(flat, 3)
+    rebuilt = export._unflatten_state(flat, structure)
+    np.testing.assert_array_equal(rebuilt[0][0], a)
+    np.testing.assert_array_equal(rebuilt[0][1], b)
+    np.testing.assert_array_equal(rebuilt[1][0], c)
+    self.assertEqual(rebuilt[1][1], ())
+
+  def test_attention_state_round_trip(self):
+    """Simulate attention state: (keys, values, mask, time, (), (), ())."""
+    keys = mx.zeros((2, 8, 4, 16))
+    values = mx.zeros((2, 8, 4, 16))
+    mask = mx.zeros((2, 8), dtype=mx.bool_)
+    time = mx.zeros((2,), dtype=mx.int32)
+    state = (keys, values, mask, time, (), (), ())
+    flat, structure = export._flatten_state(state)
+    self.assertLen(flat, 4)
+    rebuilt = export._unflatten_state(flat, structure)
+    np.testing.assert_array_equal(rebuilt[0], keys)
+    np.testing.assert_array_equal(rebuilt[1], values)
+    np.testing.assert_array_equal(rebuilt[2], mask)
+    np.testing.assert_array_equal(rebuilt[3], time)
+    self.assertEqual(rebuilt[4], ())
+    self.assertEqual(rebuilt[5], ())
+    self.assertEqual(rebuilt[6], ())
+
+  def test_serial_state_round_trip(self):
+    """Simulate Serial state: tuple of per-layer states."""
+    state = (
+        (),  # Identity (stateless)
+        (
+            mx.zeros((2, 4, 4, 8)),  # Attention keys
+            mx.zeros((2, 4, 4, 8)),  # values
+            mx.zeros((2, 4), dtype=mx.bool_),  # mask
+            mx.zeros((2,), dtype=mx.int32),  # time
+            mx.full((2, 1), -1, dtype=mx.int32),  # q_net_state
+            mx.full((2, 1), -1, dtype=mx.int32),  # k_net_state
+            (),
+        ),  # v_net_state
+        (),  # Dense (stateless)
+    )
+    flat, structure = export._flatten_state(state)
+    self.assertLen(flat, 6)
+    rebuilt = export._unflatten_state(flat, structure)
+    self.assertEqual(rebuilt[0], ())
+    self.assertLen(rebuilt[1], 7)
+    self.assertEqual(rebuilt[2], ())
+
+
+class ExportDenseTest(parameterized.TestCase):
+  """Test exporting a simple Dense layer."""
+
+  def test_export_dense_step(self):
+    from sequence_layers.mlx import dense
+
+    layer = dense.Dense(in_features=8, features=16, use_bias=True)
+    input_spec = ShapeDType((8,), mx.float32)
+    batch_size = 2
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+      path = os.path.join(tmpdir, 'dense.mlxfn')
+      export.export_step(
+          layer,
+          path,
+          batch_size=batch_size,
+          input_spec=input_spec,
+      )
+      self.assertTrue(os.path.exists(path))
+
+      # Import and run.
+      imported = mx.import_function(path)
+      flat_state, structure = export.get_initial_state_flat(
+          layer, batch_size, input_spec
+      )
+
+      x = test_utils.random_sequence(batch_size, 1, 8)
+      mx.eval(x.values, x.mask)
+
+      # Run native.
+      state = layer.get_initial_state(batch_size, input_spec, training=False)
+      y_native, _ = layer.step(x, state, training=False)
+
+      # Run exported.
+      y_vals, y_mask, new_state = export.run_exported(
+          imported, x.values, x.mask, flat_state
+      )
+      mx.eval(y_native.values, y_vals)
+
+      np.testing.assert_allclose(
+          np.array(y_vals),
+          np.array(y_native.values),
+          atol=1e-5,
+          rtol=1e-5,
+      )
+
+  def test_export_dense_no_bias(self):
+    from sequence_layers.mlx import dense
+
+    layer = dense.Dense(in_features=8, features=16, use_bias=False)
+    input_spec = ShapeDType((8,), mx.float32)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+      path = os.path.join(tmpdir, 'dense_nobias.mlxfn')
+      export.export_step(layer, path, batch_size=1, input_spec=input_spec)
+
+      imported = mx.import_function(path)
+      flat_state, _ = export.get_initial_state_flat(layer, 1, input_spec)
+
+      x = test_utils.random_sequence(1, 1, 8)
+      mx.eval(x.values, x.mask)
+
+      state = layer.get_initial_state(1, input_spec, training=False)
+      y_native, _ = layer.step(x, state, training=False)
+
+      y_vals, _, _ = export.run_exported(imported, x.values, x.mask, flat_state)
+      mx.eval(y_native.values, y_vals)
+
+      np.testing.assert_allclose(
+          np.array(y_vals),
+          np.array(y_native.values),
+          atol=1e-5,
+          rtol=1e-5,
+      )
+
+
+class ExportAttentionTest(parameterized.TestCase):
+  """Test exporting attention with KV cache."""
+
+  def test_export_attention_multi_step(self):
+    from sequence_layers.mlx import attention
+
+    layer = attention.DotProductSelfAttention(
+        in_features=16,
+        num_heads=2,
+        units_per_head=8,
+        max_past_horizon=32,
+        max_future_horizon=0,
+    )
+    input_spec = ShapeDType((16,), mx.float32)
+    batch_size = 1
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+      path = os.path.join(tmpdir, 'attn.mlxfn')
+      export.export_step(
+          layer,
+          path,
+          batch_size=batch_size,
+          input_spec=input_spec,
+      )
+
+      imported = mx.import_function(path)
+
+      # Run 3 steps natively.
+      state = layer.get_initial_state(batch_size, input_spec, training=False)
+      flat_state, structure = export._flatten_state(state)
+      mx.eval(*flat_state)
+
+      # Use same inputs for both native and exported.
+      inputs = []
+      for _ in range(3):
+        x = test_utils.random_sequence(batch_size, 1, 16)
+        mx.eval(x.values, x.mask)
+        inputs.append(x)
+
+      # Native.
+      native_state = state
+      native_outputs = []
+      for x in inputs:
+        y, native_state = layer.step(x, native_state, training=False)
+        mx.eval(y.values)
+        native_outputs.append(np.array(y.values))
+
+      # Exported.
+      exported_state = list(flat_state)
+      exported_outputs = []
+      for x in inputs:
+        y_vals, y_mask, exported_state = export.run_exported(
+            imported, x.values, x.mask, exported_state
+        )
+        mx.eval(y_vals)
+        exported_outputs.append(np.array(y_vals))
+
+      for i, (native, exported) in enumerate(
+          zip(native_outputs, exported_outputs)
+      ):
+        np.testing.assert_allclose(
+            exported,
+            native,
+            atol=1e-5,
+            rtol=1e-5,
+            err_msg=f'Step {i} mismatch',
+        )
+
+
+class ExportSerialTest(parameterized.TestCase):
+  """Test exporting a Serial model."""
+
+  def test_export_serial(self):
+    from sequence_layers.mlx import combinators
+    from sequence_layers.mlx import dense
+    from sequence_layers.mlx import normalization
+
+    model = combinators.Serial([
+        normalization.RMSNormalization(epsilon=1e-6),
+        dense.Dense(
+            in_features=8,
+            features=16,
+            use_bias=True,
+            activation=mx.sigmoid,
+        ),
+        dense.Dense(in_features=16, features=8, use_bias=True),
+    ])
+    input_spec = ShapeDType((8,), mx.float32)
+    batch_size = 2
+
+    # Materialize deferred layers.
+    export._materialize_deferred(model, batch_size, input_spec)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+      path = os.path.join(tmpdir, 'serial.mlxfn')
+      export.export_step(
+          model,
+          path,
+          batch_size=batch_size,
+          input_spec=input_spec,
+      )
+
+      imported = mx.import_function(path)
+      flat_state, structure = export.get_initial_state_flat(
+          model, batch_size, input_spec
+      )
+
+      x = test_utils.random_sequence(batch_size, 1, 8)
+      mx.eval(x.values, x.mask)
+
+      # Native.
+      state = model.get_initial_state(batch_size, input_spec, training=False)
+      y_native, _ = model.step(x, state, training=False)
+
+      # Exported.
+      y_vals, y_mask, _ = export.run_exported(
+          imported, x.values, x.mask, flat_state
+      )
+      mx.eval(y_native.values, y_vals)
+
+      np.testing.assert_allclose(
+          np.array(y_vals),
+          np.array(y_native.values),
+          atol=1e-5,
+          rtol=1e-5,
+      )
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/init_mapping.py b/sequence_layers/mlx/init_mapping.py
new file mode 100644
index 0000000..d9cc6c1
--- /dev/null
+++ b/sequence_layers/mlx/init_mapping.py
@@ -0,0 +1,239 @@
+"""Mapping JAX/Flax initializers and activations to MLX equivalents."""
+
+import math
+
+import jax
+import jax.numpy as jnp
+from mlx import nn
+import mlx.core as mx
+import numpy as np
+
+
+def _variance_scaling(key, shape, dtype, mode, distribution, fan_in, fan_out):
+  """Variance scaling initializer core logic."""
+  dtype = _to_mx_dtype(dtype)
+  if mode == 'fan_in':
+    denominator = max(fan_in, 1)
+  elif mode == 'fan_out':
+    denominator = max(fan_out, 1)
+  elif mode == 'fan_avg':
+    denominator = max((fan_in + fan_out) / 2.0, 1)
+  else:
+    raise ValueError(f'Unknown mode: {mode}')
+
+  variance = 1.0 / denominator
+  if distribution == 'truncated_normal':
+    stddev = math.sqrt(variance) / 0.87962566103423978
+    return (
+        mx.random.truncated_normal(-2.0, 2.0, shape=shape, key=key).astype(
+            dtype
+        )
+        * stddev
+    )
+  if distribution == 'normal':
+    return mx.random.normal(shape=shape, key=key).astype(dtype) * math.sqrt(
+        variance
+    )
+  if distribution == 'uniform':
+    limit = math.sqrt(3.0 * variance)
+    return mx.random.uniform(-limit, limit, shape=shape, key=key).astype(dtype)
+
+  raise ValueError(f'Unknown distribution: {distribution}')
+
+
+def _compute_fans(shape):
+  """Compute fan_in and fan_out for a weight shape."""
+  if len(shape) < 1:
+    fan_in = fan_out = 1
+  elif len(shape) == 1:
+    fan_in = fan_out = shape[0]
+  elif len(shape) == 2:
+    fan_in, fan_out = shape
+  else:
+    # Conv kernels: last two dims are (fan_in, fan_out), rest are spatial.
+    receptive_field_size = 1
+    for s in shape[:-2]:
+      receptive_field_size *= s
+    fan_in = shape[-2] * receptive_field_size
+    fan_out = shape[-1] * receptive_field_size
+  return fan_in, fan_out
+
+
+def _make_variance_scaling_init(mode, distribution):
+  """Create an MLX variance scaling initializer."""
+
+  def init_fn(key, shape, dtype=mx.float32):
+    fan_in, fan_out = _compute_fans(shape)
+    return _variance_scaling(
+        key, shape, dtype, mode, distribution, fan_in, fan_out
+    )
+
+  return init_fn
+
+
+def _to_mx_dtype(dtype):
+  """Convert any dtype (JAX, numpy, MLX) to an MLX dtype."""
+  if isinstance(dtype, mx.Dtype):
+    return dtype
+  name = getattr(dtype, '__name__', '') or str(dtype)
+  mapping = {
+      'float32': mx.float32,
+      'float16': mx.float16,
+      'bfloat16': mx.bfloat16,
+      'float64': mx.float32,  # MLX lacks float64.
+      'int32': mx.int32,
+      'int64': mx.int32,  # MLX lacks int64.
+      'int16': mx.int16,
+      'int8': mx.int8,
+      'uint8': mx.uint8,
+      'uint32': mx.uint32,
+      'bool': mx.bool_,
+      'bool_': mx.bool_,
+      'complex64': mx.complex64,
+  }
+  for key, val in mapping.items():
+    if key in name:
+      return val
+  return mx.float32
+
+
+def _zeros_init(key, shape, dtype=mx.float32):
+  """Initializer that generates tensors initialized with 0."""
+  del key
+  return mx.zeros(shape, dtype=_to_mx_dtype(dtype))
+
+
+def _ones_init(key, shape, dtype=mx.float32):
+  """Initializer that generates tensors initialized with 1."""
+  del key
+  return mx.ones(shape, dtype=_to_mx_dtype(dtype))
+
+
+def _normal_init(stddev=0.01):
+  """Initializer that generates tensors with a normal distribution."""
+
+  def init_fn(key, shape, dtype=mx.float32):
+    dtype = _to_mx_dtype(dtype)
+    return mx.random.normal(shape=shape, key=key).astype(dtype) * stddev
+
+  return init_fn
+
+
+def map_initializer(jax_init):
+  """Convert a JAX/Flax initializer to an MLX-compatible initializer.
+
+  Args:
+    jax_init: A JAX/Flax initializer function.
+
+  Returns:
+    An MLX initializer function with signature (key, shape, dtype).
+  """
+  if jax_init is None:
+    return None
+
+  # Check for common Flax initializer instances by calling with
+  # a probe to determine behavior.
+  try:
+    # Test with a small shape to determine the initializer type.
+    test_key = jax.random.PRNGKey(0)
+    test_shape = (4, 4)
+    test_out = jax_init(test_key, test_shape, jnp.float32)
+    test_np = np.array(test_out)
+
+    # Check if it's zeros.
+    if np.allclose(test_np, 0.0):
+      return _zeros_init
+
+    # Check if it's ones.
+    if np.allclose(test_np, 1.0):
+      return _ones_init
+  except Exception:  # pylint: disable=broad-exception-caught
+    pass
+
+  # Try to identify by function name or attributes.
+  name = getattr(jax_init, '__name__', '')
+  qualname = getattr(jax_init, '__qualname__', '')
+  func = getattr(jax_init, 'func', None)
+  func_qualname = getattr(func, '__qualname__', '') if func else ''
+
+  # Variance scaling variants.
+  if 'lecun_normal' in name or 'lecun_normal' in qualname:
+    return _make_variance_scaling_init('fan_in', 'truncated_normal')
+  if 'lecun_uniform' in name or 'lecun_uniform' in qualname:
+    return _make_variance_scaling_init('fan_in', 'uniform')
+  if 'glorot_normal' in name or 'glorot_normal' in qualname:
+    return _make_variance_scaling_init('fan_avg', 'truncated_normal')
+  if 'glorot_uniform' in name or 'glorot_uniform' in qualname:
+    return _make_variance_scaling_init('fan_avg', 'uniform')
+  if 'he_normal' in name or 'he_normal' in qualname:
+    return _make_variance_scaling_init('fan_in', 'normal')
+  if 'he_uniform' in name or 'he_uniform' in qualname:
+    return _make_variance_scaling_init('fan_in', 'uniform')
+  if 'xavier_normal' in name or 'xavier_normal' in qualname:
+    return _make_variance_scaling_init('fan_avg', 'normal')
+  if 'xavier_uniform' in name or 'xavier_uniform' in qualname:
+    return _make_variance_scaling_init('fan_avg', 'uniform')
+
+  # Check for variance_scaling in qualname/func.
+  if 'variance_scaling' in qualname or 'variance_scaling' in func_qualname:
+    return _make_variance_scaling_init('fan_in', 'truncated_normal')
+
+  if 'zeros' in name or 'zeros' in qualname:
+    return _zeros_init
+  if 'ones' in name or 'ones' in qualname:
+    return _ones_init
+
+  # Default fallback: lecun_normal equivalent.
+  return _make_variance_scaling_init('fan_in', 'truncated_normal')
+
+
+# ---------------------------------------------------------------------------
+# Activation mapping
+# ---------------------------------------------------------------------------
+
+_ACTIVATION_MAP = {}
+
+
+def _build_activation_map():
+  """Build the JAX -> MLX activation mapping lazily."""
+  if _ACTIVATION_MAP:
+    return
+  _ACTIVATION_MAP.update({
+      jax.nn.relu: nn.relu,
+      jax.nn.gelu: nn.gelu,
+      jax.nn.silu: nn.silu,
+      jax.nn.swish: nn.silu,  # swish == silu
+      jax.nn.sigmoid: mx.sigmoid,
+      jax.nn.tanh: mx.tanh,
+      jax.nn.softmax: mx.softmax,
+      jax.nn.elu: nn.elu,
+      jax.nn.leaky_relu: nn.leaky_relu,
+      jax.nn.log_softmax: mx.log,  # Approximate.
+  })
+  # Also add jnp versions.
+  for k, v in list(_ACTIVATION_MAP.items()):
+    name = getattr(k, '__name__', '')
+    jnp_fn = getattr(jnp, name, None)
+    if jnp_fn is not None and jnp_fn not in _ACTIVATION_MAP:
+      _ACTIVATION_MAP[jnp_fn] = v
+
+
+def map_activation(jax_activation):
+  """Convert a JAX activation function to its MLX equivalent.
+
+  Args:
+    jax_activation: A JAX activation function (e.g. jax.nn.relu).
+
+  Returns:
+    The corresponding MLX activation, or the original function
+    if no mapping is found.
+  """
+  if jax_activation is None:
+    return None
+  _build_activation_map()
+  return _ACTIVATION_MAP.get(jax_activation, jax_activation)
+
+
+to_mx_dtype = _to_mx_dtype
+zeros_init = _zeros_init
+make_variance_scaling_init = _make_variance_scaling_init
diff --git a/sequence_layers/mlx/normalization.py b/sequence_layers/mlx/normalization.py
new file mode 100644
index 0000000..ab91dc1
--- /dev/null
+++ b/sequence_layers/mlx/normalization.py
@@ -0,0 +1,463 @@
+"""Normalization layers for MLX."""
+
+import dataclasses
+from typing import Any, override
+from typing import Sequence as _Sequence
+
+from mlx import nn
+import mlx.core as mx
+
+from sequence_layers.mlx import init_mapping
+from sequence_layers.mlx import types
+from sequence_layers.mlx.init_mapping import _to_mx_dtype
+from sequence_layers.specs import normalization as spec
+
+Sequence = types.Sequence
+
+
+def _normalize_axes(axis, input_shape):
+  """Normalize axes and check batch/time are not specified."""
+  if isinstance(axis, int):
+    axis = (axis,)
+  normalized = set()
+  for a in axis:
+    if a < 0:
+      a += len(input_shape)
+    normalized.add(a)
+  axes = tuple(sorted(normalized))
+  for a in axes:
+    if a in (0, 1):
+      raise ValueError(
+          f'Normalizing over batch or time is not allowed. Got: {axes}'
+      )
+  return axes
+
+
+class L2Normalize(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.L2Normalize[types.Sequence, types.ShapeDType],
+):
+  """L2 normalization over the specified channel axes."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.L2Normalize.Config):
+    """Configuration for L2Normalize."""
+
+    axis: int | _Sequence[int] = -1
+    epsilon: float = 1e-12
+    name: str | None = None
+
+    @override
+    def make(self) -> 'L2Normalize':
+      return L2Normalize(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+    values = x.values
+    axes = _normalize_axes(self.config.axis, values.shape)
+
+    v = values.astype(mx.float32)
+    squared_sum = mx.sum(mx.square(v), axis=axes, keepdims=True)
+    normed = v * mx.rsqrt(squared_sum + self.config.epsilon)
+    return Sequence(normed.astype(values.dtype), x.mask)
+
+
+class RMSNormalization(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.RMSNormalization[types.Sequence, types.ShapeDType],
+):
+  """RMS Normalization backed by mlx.nn.RMSNorm."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.RMSNormalization.Config):
+    """Configuration for RMSNormalization."""
+
+    axis: int | _Sequence[int] = -1
+    epsilon: float = 1e-6
+    use_scale: bool = True
+    scale_init: Any | None = None
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    name: str | None = None
+
+    def __post_init__(self):
+      if not isinstance(self.axis, int):
+        object.__setattr__(self, 'axis', tuple(self.axis))
+
+    @override
+    def make(self) -> 'RMSNormalization':
+      return RMSNormalization(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      axis: int | _Sequence[int] = -1,
+      epsilon: float = 1e-6,
+      use_scale: bool = True,
+      scale_init: Any | None = None,
+      compute_dtype: types.DType | None = None,
+      param_dtype: types.DType = mx.float32,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      self.config = self.Config(
+          axis=axis,
+          epsilon=epsilon,
+          use_scale=use_scale,
+          scale_init=scale_init,
+          compute_dtype=compute_dtype,
+          param_dtype=param_dtype,
+      )
+    self._param_dtype = _to_mx_dtype(self.config.param_dtype)
+    self._scale_init = init_mapping.map_initializer(self.config.scale_init)
+    # mlx.nn.RMSNorm created lazily since we need input shape.
+    self._rms_norm = None
+    self._scale = None
+    self._use_builtin = False
+
+  def _ensure_initialized(self, input_shape):
+    """Create internal RMSNorm on first call."""
+    if self._rms_norm is not None or not self.config.use_scale:
+      return
+    axes = _normalize_axes(self.config.axis, input_shape)
+    # mlx.nn.RMSNorm only supports normalizing over the last dim.
+    if axes == (len(input_shape) - 1,) and self._scale_init is None:
+      dims = input_shape[-1]
+      self._rms_norm = nn.RMSNorm(dims, eps=self.config.epsilon)
+      self._use_builtin = True
+    else:
+      # Multi-axis or custom init: manual scale parameter.
+      scale_shape = tuple(input_shape[a] for a in axes)
+      if self._scale_init is not None:
+        key = mx.random.key(0)
+        self._scale = self._scale_init(key, scale_shape, self._param_dtype)
+      else:
+        self._scale = mx.ones(scale_shape, dtype=self._param_dtype)
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+    self._ensure_initialized(x.values.shape)
+
+    if self._use_builtin and self._rms_norm is not None:
+      # Cast back to input dtype to preserve bfloat16 compute.
+      result = self._rms_norm(x.values).astype(x.values.dtype)
+      return Sequence(result, x.mask)
+
+    values = x.values
+    axes = _normalize_axes(self.config.axis, values.shape)
+
+    # Manual RMS norm in float32.
+    v = values.astype(mx.float32)
+    mean_sq = mx.mean(mx.square(v), axis=axes, keepdims=True)
+    normed = v * mx.rsqrt(mean_sq + self.config.epsilon)
+    normed = normed.astype(values.dtype)
+
+    # Apply learned scale.
+    if self.config.use_scale and self._scale is not None:
+      scale = self._scale.astype(normed.dtype)
+      shape = [1] * len(values.shape)
+      for i, a in enumerate(axes):
+        shape[a] = self._scale.shape[i]
+      scale = scale.reshape(shape)
+      normed = normed * scale
+
+    return Sequence(normed, x.mask)
+
+
+class LayerNormalization(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.LayerNormalization[types.Sequence, types.ShapeDType],
+):
+  """Layer Normalization backed by mlx.nn.LayerNorm."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.LayerNormalization.Config):
+    """Configuration for LayerNormalization."""
+
+    axis: int | _Sequence[int] = -1
+    epsilon: float = 1e-6
+    use_bias: bool = True
+    use_scale: bool = True
+    reductions_in_at_least_fp32: bool = True
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    name: str | None = None
+
+    def __post_init__(self):
+      if not isinstance(self.axis, int):
+        object.__setattr__(self, 'axis', tuple(self.axis))
+
+    @override
+    def make(self) -> 'LayerNormalization':
+      return LayerNormalization(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._param_dtype = _to_mx_dtype(config.param_dtype)
+    self._layer_norm = None
+    self._use_builtin = False
+    self._manual_scale = None
+    self._manual_bias = None
+
+  def _ensure_initialized(self, input_shape):
+    """Create internal LayerNorm on first call."""
+    if self._layer_norm is not None or self._manual_scale is not None:
+      return
+    if not self.config.use_scale and not self.config.use_bias:
+      return
+    axes = _normalize_axes(self.config.axis, input_shape)
+    # mlx.nn.LayerNorm supports a single last-dim normalization.
+    if axes == (len(input_shape) - 1,):
+      dims = input_shape[-1]
+      self._layer_norm = nn.LayerNorm(
+          dims,
+          eps=self.config.epsilon,
+          affine=self.config.use_scale or self.config.use_bias,
+          bias=self.config.use_bias,
+      )
+      self._use_builtin = True
+    else:
+      # Multi-axis: manual parameters.
+      scale_shape = tuple(input_shape[a] for a in axes)
+      if self.config.use_scale:
+        self._manual_scale = mx.ones(scale_shape, dtype=self._param_dtype)
+      if self.config.use_bias:
+        self._manual_bias = mx.zeros(scale_shape, dtype=self._param_dtype)
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+    self._ensure_initialized(x.values.shape)
+
+    if self._use_builtin and self._layer_norm is not None:
+      x_values = x.values
+      original_dtype = x_values.dtype
+      if self.config.reductions_in_at_least_fp32:
+        x_values = x_values.astype(mx.float32)
+      # Cast back to input dtype to preserve bfloat16 compute.
+      result = self._layer_norm(x_values).astype(original_dtype)
+      return Sequence(result, x.mask)
+
+    values = x.values
+    axes = _normalize_axes(self.config.axis, values.shape)
+
+    # Manual layer norm in float32.
+    v = values.astype(mx.float32)
+    mean = mx.mean(v, axis=axes, keepdims=True)
+    variance = mx.mean(mx.square(v - mean), axis=axes, keepdims=True)
+    normed = (v - mean) * mx.rsqrt(variance + self.config.epsilon)
+    normed = normed.astype(values.dtype)
+
+    # Apply learned scale and bias.
+    if self.config.use_scale and self._manual_scale is not None:
+      scale = self._manual_scale.astype(normed.dtype)
+      shape = [1] * len(values.shape)
+      for i, a in enumerate(axes):
+        shape[a] = self._manual_scale.shape[i]
+      normed = normed * scale.reshape(shape)
+
+    if self.config.use_bias and self._manual_bias is not None:
+      bias = self._manual_bias.astype(normed.dtype)
+      shape = [1] * len(values.shape)
+      for i, a in enumerate(axes):
+        shape[a] = self._manual_bias.shape[i]
+      normed = normed + bias.reshape(shape)
+
+    return Sequence(normed, x.mask)
+
+
+class BatchNormalization(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.BatchNormalization[types.Sequence, types.ShapeDType],
+):
+  """Batch Normalization (inference-only)."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.BatchNormalization.Config):
+    """Configuration for BatchNormalization."""
+
+    axis: int | _Sequence[int] = -1
+    epsilon: float = 1e-5
+    momentum: float = 0.99
+    use_scale: bool = True
+    use_bias: bool = True
+    use_fast_variance: bool = True
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'BatchNormalization':
+      return BatchNormalization(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._param_dtype = _to_mx_dtype(config.param_dtype)
+    self._running_mean = None
+    self._running_var = None
+    self._scale = None
+    self._bias = None
+
+  def _ensure_initialized(self, input_shape):
+    """Create internal running statistics on first call."""
+    if self._running_mean is not None:
+      return
+    axes = _normalize_axes(self.config.axis, input_shape)
+    axis_size = input_shape[axes[0]]
+    self._running_mean = mx.zeros((axis_size,), dtype=self._param_dtype)
+    self._running_var = mx.ones((axis_size,), dtype=self._param_dtype)
+    if self.config.use_scale:
+      self._scale = mx.ones((axis_size,), dtype=self._param_dtype)
+    if self.config.use_bias:
+      self._bias = mx.zeros((axis_size,), dtype=self._param_dtype)
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+    self._ensure_initialized(x.values.shape)
+    assert self._running_mean is not None
+    assert self._running_var is not None
+
+    values = x.values
+    axes = _normalize_axes(self.config.axis, values.shape)
+
+    # Broadcast running stats over batch and time.
+    shape = [1] * len(values.shape)
+    shape[axes[0]] = self._running_mean.shape[0]
+
+    mean = self._running_mean.reshape(shape)
+    var = self._running_var.reshape(shape)
+
+    normed = (values.astype(mx.float32) - mean) * mx.rsqrt(
+        var + self.config.epsilon
+    )
+    normed = normed.astype(values.dtype)
+
+    if self.config.use_scale and self._scale is not None:
+      normed = normed * self._scale.reshape(shape)
+    if self.config.use_bias and self._bias is not None:
+      normed = normed + self._bias.reshape(shape)
+
+    return Sequence(normed, x.mask)
+
+
+class GroupNormalization(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.GroupNormalization[types.Sequence, types.ShapeDType],
+):
+  """Group Normalization.
+
+  Normalizes per-timestep within each group (not across time), so
+  that step() and layer() produce identical results.
+
+  Note: mlx.nn.GroupNorm normalizes across all spatial dims including
+  time, which is incompatible with the SequenceLayer step/layer contract.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.GroupNormalization.Config):
+    """Configuration for GroupNormalization."""
+
+    num_groups: int
+    axis: int | _Sequence[int] = -1
+    epsilon: float = 1e-6
+    cumulative: bool = False
+    use_scale: bool = True
+    use_bias: bool = True
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'GroupNormalization':
+      if self.num_groups <= 0:
+        raise ValueError(f'{self.num_groups=} must be positive.')
+      return GroupNormalization(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._param_dtype = _to_mx_dtype(config.param_dtype)
+    self._scale = None
+    self._bias = None
+
+  def _ensure_initialized(self, input_shape):
+    """Create internal GroupNorm components on first call."""
+    if self._scale is not None or self._bias is not None:
+      return
+    axes = _normalize_axes(self.config.axis, input_shape)
+    axis_size = input_shape[axes[0]]
+    if self.config.use_scale:
+      self._scale = mx.ones((axis_size,), dtype=self._param_dtype)
+    if self.config.use_bias:
+      self._bias = mx.zeros((axis_size,), dtype=self._param_dtype)
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+    self._ensure_initialized(x.values.shape)
+
+    values = x.values
+    axes = _normalize_axes(self.config.axis, values.shape)
+    axis = axes[0]
+    axis_size = values.shape[axis]
+
+    if axis_size % self.config.num_groups != 0:
+      raise ValueError(
+          f'Input axis {axis} size {axis_size} must be'
+          f' divisible by {self.config.num_groups}.'
+      )
+    group_size = axis_size // self.config.num_groups
+
+    # Reshape to [... num_groups, group_size ...]
+    shape = list(values.shape)
+    grouped_shape = (
+        shape[:axis] + [self.config.num_groups, group_size] + shape[axis + 1 :]
+    )
+    grouped = mx.reshape(values, grouped_shape)
+
+    # Normalize over group_size only (per-timestep).
+    g = grouped.astype(mx.float32)
+    reduce_axis = axis + 1
+    mean = mx.mean(g, axis=reduce_axis, keepdims=True)
+    variance = mx.mean(mx.square(g - mean), axis=reduce_axis, keepdims=True)
+    normed = (g - mean) * mx.rsqrt(variance + self.config.epsilon)
+    normed = mx.reshape(normed.astype(values.dtype), values.shape)
+
+    # Apply learned scale and bias.
+    if self.config.use_scale and self._scale is not None:
+      scale_shape = [1] * len(values.shape)
+      scale_shape[axis] = axis_size
+      normed = normed * self._scale.reshape(scale_shape)
+    if self.config.use_bias and self._bias is not None:
+      bias_shape = [1] * len(values.shape)
+      bias_shape[axis] = axis_size
+      normed = normed + self._bias.reshape(bias_shape)
+
+    return Sequence(normed, x.mask)
diff --git a/sequence_layers/mlx/normalization_test.py b/sequence_layers/mlx/normalization_test.py
new file mode 100644
index 0000000..fb31966
--- /dev/null
+++ b/sequence_layers/mlx/normalization_test.py
@@ -0,0 +1,262 @@
+"""Tests for normalization MLX sequence layers."""
+
+# pylint: disable=import-outside-toplevel,protected-access
+
+from absl.testing import absltest
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.mlx import normalization
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import normalization_behaviors
+
+
+class L2NormalizeTest(
+    test_utils.SequenceLayerTest,
+    normalization_behaviors.L2NormalizeTest,
+):
+
+  def test_layer(self):
+    layer = normalization.L2Normalize.Config().make()
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(layer, x)
+
+  def test_normalizes(self):
+    layer = normalization.L2Normalize.Config().make()
+    values = mx.array([[[3.0, 4.0]]])
+    mask = mx.ones((1, 1), dtype=mx.bool_)
+    x = self.random_sequence(1, 1, 2).unmask()
+    x = type(x)(values, mask)
+    y = layer.layer(x, training=False)
+    # L2 norm of [3, 4] is 5, so output should be [0.6, 0.8].
+    np.testing.assert_allclose(np.array(y.values), [[[0.6, 0.8]]], atol=1e-6)
+
+  def test_multi_axis(self):
+    layer = normalization.L2Normalize.Config(axis=(-2, -1)).make()
+    x = self.random_sequence(2, 3, 4, 3)
+    self.verify_contract(layer, x)
+
+  def test_from_config(self):
+    from sequence_layers.jax import normalization as jax_norm
+
+    config = jax_norm.L2Normalize.Config()
+    mlx_config = normalization.L2Normalize.Config(
+        axis=config.axis,
+        epsilon=config.epsilon,
+        name=config.name,
+    )
+    mlx_layer = mlx_config.make()
+    self.assertIsInstance(mlx_layer, normalization.L2Normalize)
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(mlx_layer, x)
+
+
+class RMSNormalizationTest(
+    test_utils.SequenceLayerTest,
+    normalization_behaviors.RMSNormalizationTest,
+):
+
+  def test_layer(self):
+    layer = normalization.RMSNormalization.Config().make()
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(layer, x)
+
+  def test_no_scale(self):
+    layer = normalization.RMSNormalization.Config(use_scale=False).make()
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(layer, x)
+
+  def test_normalizes(self):
+    layer = normalization.RMSNormalization.Config(use_scale=False).make()
+    values = mx.array([[[1.0, 2.0, 3.0, 4.0]]])
+    mask = mx.ones((1, 1), dtype=mx.bool_)
+    x = self.random_sequence(1, 1, 4).unmask()
+    x = type(x)(values, mask)
+    y = layer.layer(x, training=False)
+    # After RMS norm, the RMS of the output should be ~1.
+    rms = float(mx.sqrt(mx.mean(mx.square(y.values))))
+    np.testing.assert_allclose(rms, 1.0, atol=0.1)
+
+  def test_from_config(self):
+    from sequence_layers.jax import normalization as jax_norm
+
+    config = jax_norm.RMSNormalization.Config()
+    mlx_config = normalization.RMSNormalization.Config(
+        axis=config.axis,
+        epsilon=config.epsilon,
+        use_scale=config.use_scale,
+        name=config.name,
+    )
+    mlx_layer = mlx_config.make()
+    self.assertIsInstance(mlx_layer, normalization.RMSNormalization)
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(mlx_layer, x)
+
+
+class LayerNormalizationTest(
+    test_utils.SequenceLayerTest,
+    normalization_behaviors.LayerNormalizationTest,
+):
+
+  def test_layer(self):
+    layer = normalization.LayerNormalization.Config().make()
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(layer, x)
+
+  def test_no_affine(self):
+    layer = normalization.LayerNormalization.Config(
+        use_scale=False,
+        use_bias=False,
+    ).make()
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(layer, x)
+
+  def test_normalizes(self):
+    layer = normalization.LayerNormalization.Config(
+        use_scale=False,
+        use_bias=False,
+    ).make()
+    values = mx.array([[[1.0, 2.0, 3.0, 4.0]]])
+    mask = mx.ones((1, 1), dtype=mx.bool_)
+    x = self.random_sequence(1, 1, 4).unmask()
+    x = type(x)(values, mask)
+    y = layer.layer(x, training=False)
+    # After layer norm, mean should be ~0, std should be ~1.
+    mean = float(mx.mean(y.values))
+    std = float(mx.sqrt(mx.mean(mx.square(y.values - mean))))
+    np.testing.assert_allclose(mean, 0.0, atol=1e-5)
+    np.testing.assert_allclose(std, 1.0, atol=0.15)
+
+  def test_from_config(self):
+    from sequence_layers.jax import normalization as jax_norm
+
+    config = jax_norm.LayerNormalization.Config()
+    mlx_config = normalization.LayerNormalization.Config(
+        axis=config.axis,
+        epsilon=config.epsilon,
+        use_scale=config.use_scale,
+        use_bias=config.use_bias,
+        name=config.name,
+    )
+    mlx_layer = mlx_config.make()
+    self.assertIsInstance(mlx_layer, normalization.LayerNormalization)
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(mlx_layer, x)
+
+
+class BatchNormalizationTest(
+    test_utils.SequenceLayerTest,
+    normalization_behaviors.BatchNormalizationTest,
+):
+
+  def test_layer(self):
+    layer = normalization.BatchNormalization.Config().make()
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(layer, x)
+
+  def test_no_affine(self):
+    layer = normalization.BatchNormalization.Config(
+        use_scale=False,
+        use_bias=False,
+    ).make()
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(layer, x)
+
+  def test_normalizes(self):
+    layer = normalization.BatchNormalization.Config(
+        use_scale=False,
+        use_bias=False,
+    ).make()
+    # Set known running stats.
+    layer._ensure_initialized((1, 1, 4))
+    layer._running_mean = mx.array([1.0, 2.0, 3.0, 4.0])
+    layer._running_var = mx.array([1.0, 1.0, 1.0, 1.0])
+    values = mx.array([[[1.0, 2.0, 3.0, 4.0]]])
+    mask = mx.ones((1, 1), dtype=mx.bool_)
+    x = type(self.random_sequence(1, 1, 4))(values, mask)
+    y = layer.layer(x, training=False)
+    # (x - mean) / sqrt(var + eps) should be ~0
+    np.testing.assert_allclose(y.values, np.zeros((1, 1, 4)), atol=1e-3)
+
+  def test_scale_and_bias(self):
+    layer = normalization.BatchNormalization.Config(epsilon=1e-3).make()
+    layer._ensure_initialized((1, 1, 4))
+    layer._running_mean = mx.zeros((4,))
+    layer._running_var = mx.ones((4,))
+    layer._scale = mx.array([2.0, 2.0, 2.0, 2.0])
+    layer._bias = mx.array([1.0, 1.0, 1.0, 1.0])
+    values = mx.array([[[1.0, 0.0, -1.0, 2.0]]])
+    mask = mx.ones((1, 1), dtype=mx.bool_)
+    x = type(self.random_sequence(1, 1, 4))(values, mask)
+    y = layer.layer(x, training=False)
+    # (x - 0) / sqrt(1 + 0.001) * 2 + 1
+    scale = 2.0 / float(mx.sqrt(mx.array(1.001)))
+    expected = np.array([[[
+        1.0 * scale + 1.0,
+        0.0 * scale + 1.0,
+        -1.0 * scale + 1.0,
+        2.0 * scale + 1.0,
+    ]]])
+    np.testing.assert_allclose(y.values, expected, atol=1e-5)
+
+  def test_from_config(self):
+    from sequence_layers.jax import normalization as jax_norm
+
+    config = jax_norm.BatchNormalization.Config()
+    mlx_config = normalization.BatchNormalization.Config(
+        axis=config.axis,
+        epsilon=config.epsilon,
+        use_scale=config.use_scale,
+        use_bias=config.use_bias,
+        name=config.name,
+    )
+    mlx_layer = mlx_config.make()
+    self.assertIsInstance(mlx_layer, normalization.BatchNormalization)
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(mlx_layer, x)
+
+
+class GroupNormalizationTest(
+    test_utils.SequenceLayerTest,
+    normalization_behaviors.GroupNormalizationTest,
+):
+
+  def test_layer(self):
+    layer = normalization.GroupNormalization.Config(num_groups=2).make()
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(layer, x)
+
+  def test_no_affine(self):
+    layer = normalization.GroupNormalization.Config(
+        num_groups=4,
+        use_scale=False,
+        use_bias=False,
+    ).make()
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(layer, x)
+
+  def test_num_groups_must_divide(self):
+    layer = normalization.GroupNormalization.Config(num_groups=3).make()
+    with self.assertRaises(ValueError):
+      layer.layer(self.random_sequence(1, 2, 8), training=False)
+
+  def test_from_config(self):
+    from sequence_layers.jax import normalization as jax_norm
+
+    config = jax_norm.GroupNormalization.Config(num_groups=2)
+    mlx_config = normalization.GroupNormalization.Config(
+        num_groups=config.num_groups,
+        axis=config.axis,
+        epsilon=config.epsilon,
+        use_scale=config.use_scale,
+        use_bias=config.use_bias,
+        name=config.name,
+    )
+    mlx_layer = mlx_config.make()
+    self.assertIsInstance(mlx_layer, normalization.GroupNormalization)
+    x = self.random_sequence(2, 3, 8)
+    self.verify_contract(mlx_layer, x)
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/pooling.py b/sequence_layers/mlx/pooling.py
new file mode 100644
index 0000000..c732243
--- /dev/null
+++ b/sequence_layers/mlx/pooling.py
@@ -0,0 +1,696 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pooling layers for MLX."""
+
+import dataclasses
+import fractions
+from typing import Any, override
+
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.mlx import convolution as conv_utils
+from sequence_layers.mlx import types
+from sequence_layers.specs import pooling as spec
+
+
+def _is_floating(dtype):
+  """Returns True if the given MLX/numpy dtype is a floating type."""
+  return np.issubdtype(
+      np.dtype(str(dtype).rsplit('.', maxsplit=1)[-1]), np.floating
+  )
+
+
+def _is_integer(dtype):
+  """Returns True if the given MLX/numpy dtype is an integer type."""
+  return np.issubdtype(
+      np.dtype(str(dtype).rsplit('.', maxsplit=1)[-1]), np.integer
+  )
+
+
+def _max_pool_init_value(dtype) -> Any:
+  """Returns the initial value for MaxPooling1D for a given dtype."""
+  if _is_floating(dtype):
+    return float('-inf')
+  if _is_integer(dtype):
+    np_dt = np.dtype(str(dtype).rsplit('.', maxsplit=1)[-1])
+    return int(np.iinfo(np_dt).min)
+  if str(dtype) == 'mlx.core.bool':
+    return False
+  raise ValueError(f'Unsupported dtype for max pool: {dtype}')
+
+
+def _min_pool_init_value(dtype) -> Any:
+  """Returns the initial value for MinPooling1D for a given dtype."""
+  if _is_floating(dtype):
+    return float('inf')
+  if _is_integer(dtype):
+    np_dt = np.dtype(str(dtype).rsplit('.', maxsplit=1)[-1])
+    return int(np.iinfo(np_dt).max)
+  if str(dtype) == 'mlx.core.bool':
+    return True
+  raise ValueError(f'Unsupported dtype for min pool: {dtype}')
+
+
+bt = types
+
+Sequence = bt.Sequence
+
+MaskedSequence = bt.MaskedSequence
+PaddingMode = bt.PaddingMode
+
+# Reuse convolution utilities.
+# pylint: disable=protected-access
+_effective_kernel_size = conv_utils._effective_kernel_size
+_explicit_padding = conv_utils._explicit_padding
+_buffer_width = conv_utils._buffer_width
+_compute_conv_mask = conv_utils._compute_conv_mask
+# pylint: enable=protected-access
+
+# Pooling supports fewer step modes than convolution (no causal_valid).
+_STEP_PADDINGS = frozenset({
+    PaddingMode.REVERSE_CAUSAL_VALID.value,
+    PaddingMode.CAUSAL.value,
+    PaddingMode.REVERSE_CAUSAL.value,
+    PaddingMode.SEMICAUSAL.value,
+})
+
+
+def _reduce_window_1d(values, pool_size, stride, dilation_rate, reduce_fn):
+  """Gather pooling windows and reduce along the window axis.
+
+  Args:
+    values: [batch, time, *channels] input tensor (already padded).
+    pool_size: Size of the pooling window.
+    stride: Stride between windows.
+    dilation_rate: Dilation of the pooling window.
+    reduce_fn: Function(array, axis) -> array.
+
+  Returns:
+    [batch, num_outputs, *channels]
+  """
+  if pool_size == 1 and stride == 1:
+    return values
+  if pool_size == 1:
+    return values[:, ::stride]
+
+  t = values.shape[1]
+  ek = _effective_kernel_size(pool_size, dilation_rate)
+  num_outputs = max(0, (t - ek) // stride + 1)
+  if num_outputs == 0:
+    out_shape = (values.shape[0], 0) + values.shape[2:]
+    return mx.zeros(out_shape, dtype=values.dtype)
+
+  window_offsets = mx.arange(pool_size) * dilation_rate
+  start_positions = mx.arange(num_outputs) * stride
+  indices = start_positions[:, None] + window_offsets[None, :]
+  gathered = values[:, indices]  # [b, n, pool_size, *channels]
+  return reduce_fn(gathered, axis=2)
+
+
+def _reduce_window_masked_avg_1d(
+    values, mask, pool_size, stride, dilation_rate
+):
+  """Sum-then-divide pooling with mask-aware divisor.
+
+  Args:
+    values: [batch, time, *channels] already masked to zero.
+    mask: [batch, time] boolean mask.
+    pool_size: Size of the pooling window.
+    stride: Stride between windows.
+    dilation_rate: Dilation of the pooling window.
+
+  Returns:
+    [batch, num_outputs, *channels]
+  """
+  t = values.shape[1]
+  ek = _effective_kernel_size(pool_size, dilation_rate)
+  num_outputs = max(0, (t - ek) // stride + 1)
+  if num_outputs == 0:
+    out_shape = (values.shape[0], 0) + values.shape[2:]
+    return mx.zeros(out_shape, dtype=values.dtype)
+
+  window_offsets = mx.arange(pool_size) * dilation_rate
+  start_positions = mx.arange(num_outputs) * stride
+  indices = start_positions[:, None] + window_offsets[None, :]
+
+  gathered = values[:, indices]
+  v_sum = mx.sum(gathered, axis=2)
+
+  if mx.issubdtype(values.dtype, mx.integer):
+    gathered_mask = mask[:, indices].astype(mx.int32)
+    count = mx.sum(gathered_mask, axis=2)  # [b, n]
+    count = mx.maximum(count, 1)
+    # Expand to broadcast over channel dims.
+    for _ in range(values.ndim - 2):
+      count = mx.expand_dims(count, axis=-1)
+    return v_sum // count
+
+  gathered_mask = mask[:, indices].astype(mx.float32)
+  count = mx.sum(gathered_mask, axis=2)  # [b, n]
+  count = mx.maximum(count, 1.0)
+  # Expand to broadcast over channel dims.
+  for _ in range(values.ndim - 2):
+    count = mx.expand_dims(count, axis=-1)
+  return v_sum / count
+
+
+def _compute_initial_state_pooling(
+    batch_size, input_spec, buf_width, padding, pad_value=0.0
+):
+  """Create initial buffer state for pooling step mode."""
+  if padding in (
+      PaddingMode.CAUSAL_VALID.value,
+      PaddingMode.REVERSE_CAUSAL_VALID.value,
+      PaddingMode.SEMICAUSAL_FULL.value,
+  ):
+    mask = mx.ones((batch_size, buf_width), dtype=bt.MASK_DTYPE)
+  elif padding in (
+      PaddingMode.CAUSAL.value,
+      PaddingMode.REVERSE_CAUSAL.value,
+      PaddingMode.SEMICAUSAL.value,
+  ):
+    mask = mx.zeros((batch_size, buf_width), dtype=bt.MASK_DTYPE)
+  else:
+    raise ValueError(f'Step not supported with padding: {padding}')
+
+  values = mx.full(
+      (batch_size, buf_width) + input_spec.shape,
+      pad_value,
+      dtype=input_spec.dtype,
+  )
+  # Return Sequence (not MaskedSequence) — matches JAX's .unmask().
+  return Sequence(values, mask)
+
+
+class _Pooling1D(
+    types.PreservesType,
+    types.SequenceLayer,
+    spec.BasePooling[types.Sequence, types.ShapeDType],
+):
+  """Base class for 1D pooling layers."""
+
+  def __init__(self, pool_size, strides=1, dilation_rate=1, padding='valid'):
+    super().__init__()
+    self._pool_size = pool_size
+    self._strides = strides
+    self._dilation_rate = dilation_rate
+    self._padding = padding
+
+  def _pad_value(self, dtype):
+    """Returns the pad value for the given dtype."""
+    raise NotImplementedError
+
+  def _reduce(self, gathered, axis):
+    """Reduces gathered windows along the specified axis."""
+    raise NotImplementedError
+
+  @override
+  @property
+  def supports_step(self):
+    return self._padding in _STEP_PADDINGS
+
+  @override
+  @property
+  def block_size(self):
+    return self._strides
+
+  @override
+  @property
+  def output_ratio(self):
+    return fractions.Fraction(1, self._strides)
+
+  @override
+  @property
+  def input_latency(self):
+    ek = _effective_kernel_size(self._pool_size, self._dilation_rate)
+    if self._padding in (
+        PaddingMode.CAUSAL_VALID.value,
+        PaddingMode.CAUSAL.value,
+        PaddingMode.SEMICAUSAL.value,
+    ):
+      return 0
+    if self._padding in (
+        PaddingMode.REVERSE_CAUSAL_VALID.value,
+        PaddingMode.REVERSE_CAUSAL.value,
+    ):
+      return ek - 1
+    return 0
+
+  @property
+  @override
+  def receptive_field(self) -> types.ReceptiveField:
+    return super().receptive_field
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return tuple(input_shape)
+
+  @override
+  def get_initial_state(
+      self,
+      batch_size: int,
+      input_spec: types.ShapeDType,
+      *,
+      training: bool,
+      constants=None,
+  ):
+    bw = _buffer_width(
+        self._padding,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+    )
+    if not bw:
+      return ()
+    return _compute_initial_state_pooling(
+        batch_size,
+        input_spec,
+        bw,
+        self._padding,
+        pad_value=self._pad_value(input_spec.dtype),
+    )
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+    pad_value = self._pad_value(x.dtype)
+    # pylint: disable=protected-access
+    output_length = conv_utils._compute_output_length(
+        x.shape[1],
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+        self._padding,
+    )
+    # pylint: enable=protected-access
+    if output_length == 0:
+      empty_values = mx.zeros(
+          (x.shape[0], 0, x.shape[-1]), dtype=x.values.dtype
+      )
+      empty_mask = mx.zeros((x.shape[0], 0), dtype=mx.bool_)
+      return Sequence(empty_values, empty_mask)
+
+    if self._pool_size > 1:
+      x = x.mask_invalid(pad_value)
+
+    pad_left, pad_right = _explicit_padding(
+        self._padding,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+    )
+    values = x.values
+    if pad_left > 0 or pad_right > 0:
+      pad_widths = [(0, 0), (pad_left, pad_right)] + [(0, 0)] * (
+          values.ndim - 2
+      )
+      values = mx.pad(values, pad_widths, constant_values=pad_value)
+
+    values = _reduce_window_1d(
+        values,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+        self._reduce,
+    )
+    mask = _compute_conv_mask(
+        x.mask,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+        self._padding,
+        is_step=False,
+    )
+    return Sequence(values, mask)
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state, *, training: bool = False, constants=None
+  ):
+    pad_value = self._pad_value(x.dtype)
+    ek = _effective_kernel_size(self._pool_size, self._dilation_rate)
+    if ek > 1:
+      x = x.mask_invalid(pad_value)
+
+    bw = _buffer_width(
+        self._padding,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+    )
+
+    if bw:
+      state = state.concatenate(x)  # pyrefly: ignore[missing-attribute]
+    else:
+      state = x
+
+    values = _reduce_window_1d(
+        state.values,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+        self._reduce,
+    )
+    mask = _compute_conv_mask(
+        state.mask,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+        self._padding,
+        is_step=True,
+    )
+
+    if bw:
+      state = state[:, -bw:]
+    else:
+      state = ()
+
+    return Sequence(values, mask), state
+
+
+class MaxPooling1D(
+    _Pooling1D, spec.MaxPooling1D[types.Sequence, types.ShapeDType]
+):
+  """1D max pooling layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.MaxPooling1D.Config):
+    """Configuration for MaxPooling1D."""
+
+    pool_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: types.PaddingModeString = types.PaddingMode.VALID.value
+    name: str | None = None
+
+    @override
+    def make(self) -> 'MaxPooling1D':
+      return MaxPooling1D(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      pool_size: int | None = None,
+      strides: int = 1,
+      dilation_rate: int = 1,
+      padding: types.PaddingModeString = 'valid',
+  ):
+    if config is not None:
+      super().__init__(
+          pool_size=config.pool_size,
+          strides=config.strides,
+          dilation_rate=config.dilation_rate,
+          padding=config.padding,
+      )
+      self.config = config
+    else:
+      if pool_size is None:
+        raise ValueError('Must provide either config or pool_size')
+      super().__init__(
+          pool_size=pool_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          padding=padding,
+      )
+      self.config = self.Config(
+          pool_size=pool_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          padding=padding,
+      )
+
+  @override
+  def _pad_value(self, dtype):
+    return _max_pool_init_value(dtype)
+
+  @override
+  def _reduce(self, gathered, axis):
+    return mx.max(gathered, axis=axis)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a MaxPooling1D instance from config."""
+    return cls(config)
+
+
+class MinPooling1D(
+    _Pooling1D, spec.MinPooling1D[types.Sequence, types.ShapeDType]
+):
+  """1D min pooling layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.MinPooling1D.Config):
+    """Configuration for MinPooling1D."""
+
+    pool_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: types.PaddingModeString = types.PaddingMode.VALID.value
+    name: str | None = None
+
+    @override
+    def make(self) -> 'MinPooling1D':
+      return MinPooling1D(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      pool_size: int | None = None,
+      strides: int = 1,
+      dilation_rate: int = 1,
+      padding: types.PaddingModeString = 'valid',
+  ):
+    if config is not None:
+      super().__init__(
+          pool_size=config.pool_size,
+          strides=config.strides,
+          dilation_rate=config.dilation_rate,
+          padding=config.padding,
+      )
+      self.config = config
+    else:
+      if pool_size is None:
+        raise ValueError('Must provide either config or pool_size')
+      super().__init__(
+          pool_size=pool_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          padding=padding,
+      )
+      self.config = self.Config(
+          pool_size=pool_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          padding=padding,
+      )
+
+  @override
+  def _pad_value(self, dtype):
+    return _min_pool_init_value(dtype)
+
+  @override
+  def _reduce(self, gathered, axis):
+    return mx.min(gathered, axis=axis)
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates a MinPooling1D instance from config."""
+    return cls(config)
+
+
+class AveragePooling1D(
+    _Pooling1D,
+    spec.AveragePooling1D[types.Sequence, types.ShapeDType],
+):
+  """1D average pooling layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.AveragePooling1D.Config):
+    """Configuration for AveragePooling1D."""
+
+    pool_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: types.PaddingModeString = types.PaddingMode.VALID.value
+    masked_average: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> 'AveragePooling1D':
+      return AveragePooling1D(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      pool_size: int | None = None,
+      strides: int = 1,
+      dilation_rate: int = 1,
+      padding: types.PaddingModeString = 'valid',
+      masked_average: bool = False,
+  ):
+    if config is not None:
+      super().__init__(
+          pool_size=config.pool_size,
+          strides=config.strides,
+          dilation_rate=config.dilation_rate,
+          padding=config.padding,
+      )
+      self._masked_average = config.masked_average
+      self.config = config
+    else:
+      if pool_size is None:
+        raise ValueError('Must provide either config or pool_size')
+      super().__init__(
+          pool_size=pool_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          padding=padding,
+      )
+      self._masked_average = masked_average
+      self.config = self.Config(
+          pool_size=pool_size,
+          strides=strides,
+          dilation_rate=dilation_rate,
+          padding=padding,
+          masked_average=masked_average,
+      )
+
+  @override
+  def _pad_value(self, dtype):
+    """Returns the pad value for the given dtype."""
+    if mx.issubdtype(dtype, mx.integer):
+      return 0
+    if dtype == mx.bool_:
+      return False
+    return 0.0
+
+  @override
+  def _reduce(self, gathered, axis):
+    """Reduces gathered windows along the specified axis."""
+    if mx.issubdtype(gathered.dtype, mx.integer):
+      return mx.sum(gathered, axis=axis) // gathered.shape[axis]
+    return mx.mean(gathered, axis=axis)
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool = False, constants=None
+  ):
+    if not self._masked_average:
+      return _Pooling1D.layer.__wrapped__(  # pylint: disable=no-member  # pyrefly: ignore[missing-attribute]
+          self, x, training=training, constants=constants
+      )
+
+    pad_value = self._pad_value(x.dtype)
+    # Masked average: divide by count of valid elements.
+
+    x = x.mask_invalid(pad_value)
+    pad_left, pad_right = _explicit_padding(
+        self._padding,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+    )
+    values = x.values
+    input_mask = x.mask
+    if pad_left > 0 or pad_right > 0:
+      pad_widths = [(0, 0), (pad_left, pad_right)] + [(0, 0)] * (
+          values.ndim - 2
+      )
+      values = mx.pad(values, pad_widths, constant_values=pad_value)
+      input_mask = mx.pad(
+          input_mask,
+          [(0, 0), (pad_left, pad_right)],
+          constant_values=False,
+      )
+
+    values = _reduce_window_masked_avg_1d(
+        values,
+        input_mask,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+    )
+    mask = _compute_conv_mask(
+        x.mask,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+        self._padding,
+        is_step=False,
+    )
+    return Sequence(values, mask)
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state, *, training: bool = False, constants=None
+  ):
+    if not self._masked_average:
+      return _Pooling1D.step.__wrapped__(  # pylint: disable=no-member  # pyrefly: ignore[missing-attribute]
+          self, x, state, training=training, constants=constants
+      )
+
+    # Masked average step.
+    pad_value = self._pad_value(x.dtype)
+    ek = _effective_kernel_size(self._pool_size, self._dilation_rate)
+    if ek > 1:
+      x = x.mask_invalid(pad_value)
+
+    bw = _buffer_width(
+        self._padding,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+    )
+
+    if bw:
+      state = state.concatenate(x)
+    else:
+      state = x
+
+    values = _reduce_window_masked_avg_1d(
+        state.values,
+        state.mask,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+    )
+    mask = _compute_conv_mask(
+        state.mask,
+        self._pool_size,
+        self._strides,
+        self._dilation_rate,
+        self._padding,
+        is_step=True,
+    )
+
+    if bw:
+      state = state[:, -bw:]
+    else:
+      state = ()
+
+    return Sequence(values, mask), state
+
+  @classmethod
+  def from_config(cls, config):
+    """Creates an AveragePooling1D instance from config."""
+    return cls(config)
diff --git a/sequence_layers/mlx/pooling_test.py b/sequence_layers/mlx/pooling_test.py
new file mode 100644
index 0000000..64c6ca2
--- /dev/null
+++ b/sequence_layers/mlx/pooling_test.py
@@ -0,0 +1,110 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for pooling MLX sequence layers."""
+
+from absl.testing import absltest
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.jax import pooling as jax_pooling
+from sequence_layers.mlx import pooling
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import pooling_behaviors as spec
+
+
+class Pooling1DTest(test_utils.SequenceLayerTest, spec.Pooling1DTest):
+  """Shared behavior tests for 1D pooling layers in MLX."""
+
+
+class MaxPooling1DTest(test_utils.SequenceLayerTest):
+  """MLX-specific tests for MaxPooling1D."""
+
+  def test_max_values(self):
+    values = mx.array([[[1.0], [3.0], [2.0], [5.0], [4.0]]])
+    mask = mx.ones((1, 5), dtype=mx.bool_)
+    x = type(self.random_sequence(1, 5, 1))(values, mask)
+    layer = pooling.MaxPooling1D(pool_size=3, padding='valid')
+    y = layer.layer(x, training=False)
+    expected = np.array([[[3.0], [5.0], [5.0]]])
+    np.testing.assert_allclose(y.values, expected)
+
+  def test_from_config(self):
+    config = jax_pooling.MaxPooling1D.Config(
+        pool_size=3,
+        padding='causal',
+    )
+    mlx_layer = pooling.MaxPooling1D.from_config(config)
+    self.assertIsInstance(mlx_layer, pooling.MaxPooling1D)
+    self.verify_contract(mlx_layer, self.random_sequence(2, 10, 4))
+
+
+class MinPooling1DTest(test_utils.SequenceLayerTest):
+  """MLX-specific tests for MinPooling1D."""
+
+  def test_min_values(self):
+    values = mx.array([[[5.0], [3.0], [4.0], [1.0], [2.0]]])
+    mask = mx.ones((1, 5), dtype=mx.bool_)
+    x = type(self.random_sequence(1, 5, 1))(values, mask)
+    layer = pooling.MinPooling1D(pool_size=3, padding='valid')
+    y = layer.layer(x, training=False)
+    expected = np.array([[[3.0], [1.0], [1.0]]])
+    np.testing.assert_allclose(y.values, expected)
+
+  def test_from_config(self):
+    config = jax_pooling.MinPooling1D.Config(
+        pool_size=3,
+        padding='causal',
+    )
+    mlx_layer = pooling.MinPooling1D.from_config(config)
+    self.assertIsInstance(mlx_layer, pooling.MinPooling1D)
+    self.verify_contract(mlx_layer, self.random_sequence(2, 10, 4))
+
+
+class AveragePooling1DTest(test_utils.SequenceLayerTest):
+  """MLX-specific tests for AveragePooling1D."""
+
+  def test_average_values(self):
+    values = mx.array([[[3.0], [6.0], [9.0], [12.0], [15.0]]])
+    mask = mx.ones((1, 5), dtype=mx.bool_)
+    x = type(self.random_sequence(1, 5, 1))(values, mask)
+    layer = pooling.AveragePooling1D(pool_size=3, padding='valid')
+    y = layer.layer(x, training=False)
+    expected = np.array([[[6.0], [9.0], [12.0]]])
+    np.testing.assert_allclose(y.values, expected)
+
+  def test_masked_average(self):
+    values = mx.array([[[3.0], [6.0], [0.0]]])
+    mask = mx.array([[True, True, False]])
+    x = type(self.random_sequence(1, 3, 1))(values, mask)
+    layer = pooling.AveragePooling1D(
+        pool_size=3,
+        padding='valid',
+        masked_average=True,
+    )
+    y = layer.layer(x, training=False)
+    expected = np.array([[[4.5]]])
+    np.testing.assert_allclose(y.values, expected, atol=1e-5)
+
+  def test_from_config(self):
+    config = jax_pooling.AveragePooling1D.Config(
+        pool_size=3,
+        padding='causal',
+    )
+    mlx_layer = pooling.AveragePooling1D.from_config(config)
+    self.assertIsInstance(mlx_layer, pooling.AveragePooling1D)
+    self.verify_contract(mlx_layer, self.random_sequence(2, 10, 4))
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/position.py b/sequence_layers/mlx/position.py
new file mode 100644
index 0000000..a41280b
--- /dev/null
+++ b/sequence_layers/mlx/position.py
@@ -0,0 +1,511 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Position embeddings and timing signals for MLX."""
+
+import dataclasses
+import math
+from typing import Any, cast, override
+
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.mlx import types
+from sequence_layers.mlx.init_mapping import _to_mx_dtype
+from sequence_layers.specs import position as position_spec
+
+from . import types as bt
+
+Sequence = bt.Sequence
+MaskedSequence = bt.MaskedSequence
+
+
+def _match_shape_along_axes(channel_shape, axes):
+  """Return a shape matching channel_shape on axes and is equal to 1 elsewhere."""
+  if axes is None:
+    return tuple(channel_shape)
+
+  target_shape = [1] * len(channel_shape)
+  if isinstance(axes, int):
+    axes = [axes]
+  for axis in axes:
+    if not -len(channel_shape) <= axis < len(channel_shape):
+      raise ValueError(f'Invalid {axis=} found in {axes=}.')
+    target_shape[axis] = channel_shape[axis]
+  return tuple(target_shape)
+
+
+def _get_timing_signal_1d_pos(
+    position, channels, min_timescale=1.0, max_timescale=1.0e4, dtype=mx.float32
+):
+  """Compute 1D sinusoidal timing signal in MLX."""
+  position = position.astype(mx.float32)
+  num_timescales = channels // 2
+  log_timescale_increment = math.log(
+      float(max_timescale) / float(min_timescale)
+  ) / max(num_timescales - 1, 1)
+
+  inv_timescales = min_timescale * np.exp(
+      np.arange(num_timescales, dtype=np.float32) * -log_timescale_increment
+  )
+  inv_timescales_mx = mx.array(inv_timescales, dtype=mx.float32)
+
+  scaled_time = (
+      mx.expand_dims(position, axis=2) * inv_timescales_mx[None, None, :]
+  )
+  timing_signal = mx.concatenate(
+      [mx.sin(scaled_time), mx.cos(scaled_time)], axis=2
+  )
+  if channels % 2 != 0:
+    padding = mx.zeros(
+        (timing_signal.shape[0], timing_signal.shape[1], 1),
+        dtype=timing_signal.dtype,
+    )
+    timing_signal = mx.concatenate([timing_signal, padding], axis=2)
+  return timing_signal.astype(dtype)
+
+
+class AddTimingSignal(
+    types.PreservesType,
+    types.PreservesShape,
+    types.SequenceLayer,
+    position_spec.AddTimingSignal[types.Sequence, types.ChannelSpec],
+):
+  """Adds sinusoids at varying frequencies to the input channels dimension."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, position_spec.AddTimingSignal.Config):
+    """Configuration for AddTimingSignal."""
+
+    min_timescale: float = 1.0
+    max_timescale: float = 1.0e4
+    trainable_scale: bool = False
+    axes: int | tuple[int, ...] | None = None
+    sharding: Any = None
+    param_dtype: types.DType = mx.float32
+    only_advance_position_for_valid_timesteps: bool = True
+    name: str | None = None
+
+    @override
+    def make(self) -> 'AddTimingSignal':
+      return AddTimingSignal(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      min_timescale: float = 1.0,
+      max_timescale: float = 1.0e4,
+      trainable_scale: bool = False,
+      axes: int | tuple[int, ...] | None = None,
+      only_advance_position_for_valid_timesteps: bool = True,
+      param_dtype: types.DType = mx.float32,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      self.config = self.Config(
+          min_timescale=min_timescale,
+          max_timescale=max_timescale,
+          trainable_scale=trainable_scale,
+          axes=axes,
+          only_advance_position_for_valid_timesteps=only_advance_position_for_valid_timesteps,
+          param_dtype=param_dtype,
+      )
+
+    self.min_timescale = self.config.min_timescale
+    self.max_timescale = self.config.max_timescale
+    self.trainable_scale = self.config.trainable_scale
+    self.axes = self.config.axes
+    self.only_advance_position_for_valid_timesteps = (
+        self.config.only_advance_position_for_valid_timesteps
+    )
+    self.param_dtype = self.config.param_dtype
+
+    if self.trainable_scale:
+      self.scale = mx.ones((), dtype=self.param_dtype)
+    else:
+      self.scale = cast(Any, None)
+
+  def _check_inputs(self, input_spec):
+    """Validates the input specification."""
+    if input_spec.dtype not in (
+        mx.float16,
+        mx.bfloat16,
+        mx.float32,
+    ):
+      raise ValueError(
+          f'{type(self).__name__} requires floating point argument.'
+      )
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return tuple(input_shape)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return input_dtype
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    self._check_inputs(input_spec)
+    if self.only_advance_position_for_valid_timesteps:
+      return mx.full((batch_size, 1), -1, dtype=mx.int32)
+    return mx.zeros((batch_size, 1), dtype=mx.int32)
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state: Any, *, training: bool, constants=None
+  ):
+    self._check_inputs(x.channel_spec)
+    time = x.shape[1]
+    target_shape = _match_shape_along_axes(x.channel_shape, axes=self.axes)
+
+    if self.only_advance_position_for_valid_timesteps:
+      position = state + mx.cumsum(x.mask.astype(mx.int32), axis=1)
+      state = position[:, -1:]
+    else:
+      position = state + mx.arange(time, dtype=mx.int32)
+      state = state + time
+
+    timing_signal = _get_timing_signal_1d_pos(
+        position,
+        np.prod(target_shape),
+        min_timescale=self.min_timescale,
+        max_timescale=self.max_timescale,
+        dtype=self.param_dtype,
+    )
+    batch_size = x.shape[0]
+    timing_signal = mx.reshape(
+        timing_signal, [batch_size, time] + list(target_shape)
+    )
+    if self.scale is not None:
+      timing_signal = timing_signal * self.scale
+    x = x.apply_values(lambda v: v + timing_signal.astype(v.dtype))
+    return x, state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    self._check_inputs(x.channel_spec)
+    target_shape = _match_shape_along_axes(x.channel_shape, axes=self.axes)
+
+    if self.only_advance_position_for_valid_timesteps:
+      position = mx.maximum(0, mx.cumsum(x.mask.astype(mx.int32), axis=1) - 1)
+    else:
+      position = mx.expand_dims(mx.arange(x.shape[1], dtype=mx.int32), axis=0)
+
+    timing_signal = _get_timing_signal_1d_pos(
+        position,
+        np.prod(target_shape),
+        min_timescale=self.min_timescale,
+        max_timescale=self.max_timescale,
+        dtype=self.param_dtype,
+    )
+    timing_signal = mx.reshape(
+        timing_signal, list(position.shape[:2]) + list(target_shape)
+    )
+    if self.scale is not None:
+      timing_signal = timing_signal * self.scale
+    x = x.apply_values(lambda v: v + timing_signal.astype(v.dtype))
+    return x
+
+  @classmethod
+  def from_config(cls, config):
+    """Instantiates the layer from config."""
+    mlx_config = cls.Config(
+        min_timescale=config.min_timescale,
+        max_timescale=config.max_timescale,
+        trainable_scale=config.trainable_scale,
+        axes=config.axes,
+        only_advance_position_for_valid_timesteps=config.only_advance_position_for_valid_timesteps,
+        param_dtype=_to_mx_dtype(config.param_dtype),
+        name=config.name,
+    )
+    return cls(mlx_config)
+
+
+class ApplyRotaryPositionalEncoding(
+    types.PreservesType,
+    types.PreservesShape,
+    types.SequenceLayer,
+    position_spec.ApplyRotaryPositionalEncoding[
+        types.Sequence, types.ChannelSpec
+    ],
+):
+  """Applies Rotary Positional Encodings (RoPE) to the sequence."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(
+      types.SequenceLayerConfig,
+      position_spec.ApplyRotaryPositionalEncoding.Config,
+  ):
+    """Configuration for ApplyRotaryPositionalEncoding."""
+
+    max_wavelength: float
+    axis: int = -1
+    only_advance_position_for_valid_timesteps: bool = True
+    positions_in_at_least_fp32: bool = True
+    positions_name: str | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> 'ApplyRotaryPositionalEncoding':
+      return ApplyRotaryPositionalEncoding(self)
+
+  def __init__(
+      self,
+      config: Config | None = None,
+      *,
+      max_wavelength: float | None = None,
+      axis: int = -1,
+      only_advance_position_for_valid_timesteps: bool = True,
+      positions_in_at_least_fp32: bool = True,
+      positions_name: str | None = None,
+  ):
+    super().__init__()
+    if config is not None:
+      self.config = config
+    else:
+      if max_wavelength is None:
+        raise ValueError('Must provide either config or max_wavelength')
+      self.config = self.Config(
+          max_wavelength=max_wavelength,
+          axis=axis,
+          only_advance_position_for_valid_timesteps=only_advance_position_for_valid_timesteps,
+          positions_in_at_least_fp32=positions_in_at_least_fp32,
+          positions_name=positions_name,
+      )
+
+    self.max_wavelength = self.config.max_wavelength
+    self._axis = self.config.axis
+    self.only_advance_position_for_valid_timesteps = (
+        self.config.only_advance_position_for_valid_timesteps
+    )
+    self.positions_in_at_least_fp32 = self.config.positions_in_at_least_fp32
+    self.positions_name = self.config.positions_name
+
+  def _validate(self):
+    """Validates the configuration properties."""
+    if self.only_advance_position_for_valid_timesteps and self.positions_name:
+      raise ValueError(
+          'only_advance_position_for_valid_timesteps is incompatible with'
+          f' {self.positions_name=}.'
+      )
+
+  def _check_inputs(self, input_spec):
+    """Validates input specifications and shape constraints."""
+    self._validate()
+    if input_spec.dtype not in (
+        mx.float16,
+        mx.bfloat16,
+        mx.float32,
+    ):
+      raise ValueError(
+          f'{type(self).__name__} requires floating point argument.'
+      )
+    input_shape = (None, None) + tuple(input_spec.shape)
+    axis = self._axis + len(input_shape) if self._axis < 0 else self._axis
+    if axis <= 1:
+      raise ValueError(
+          f'{type(self).__name__} axis ({self._axis}) must refer to a'
+          f' channels dimension ({input_spec=}).'
+      )
+    axis_size = input_shape[axis]
+    if axis_size is not None and axis_size % 2 != 0:
+      raise ValueError(
+          f'{type(self).__name__} requires input_shape[{axis}]={axis_size} to'
+          ' be even.'
+      )
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return tuple(input_shape)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    return input_dtype
+
+  def _apply_rope(self, x, offset_or_positions):
+    """Applies rotary position encoding to x.
+
+    If rotation axis is the last dimension and we are using a simple temporal offset
+    (i.e. not custom positions from positions_name), we leverage the highly optimized
+    `mx.fast.rope` C++ operation. Otherwise, we fall back to manual trig calculation.
+    """
+    axis = self._axis + x.ndim if self._axis < 0 else self._axis
+
+    is_custom_positions = (
+        hasattr(offset_or_positions, 'ndim') and offset_or_positions.ndim >= 2
+    )
+
+    if is_custom_positions or axis != x.ndim - 1:
+      # Manual fallback
+      channel_ndim = x.ndim - 2
+      axis_dim = x.shape[axis]
+      assert axis_dim % 2 == 0
+
+      freq_exponents = (
+          2.0 * mx.arange(axis_dim // 2).astype(mx.float32) / axis_dim
+      )
+      timescale = self.max_wavelength**freq_exponents
+
+      broadcast_shape = [1] * x.ndim
+      broadcast_shape[axis] = axis_dim // 2
+
+      if is_custom_positions:
+        positions = offset_or_positions
+      else:
+        offset = offset_or_positions
+        positions = mx.expand_dims(
+            mx.arange(x.shape[1]), axis=0
+        ) + mx.expand_dims(offset, axis=1)
+
+      positions_f = positions.astype(mx.float32)
+      radians = positions_f.reshape(
+          positions_f.shape + (1,) * channel_ndim
+      ) / timescale.reshape(broadcast_shape)
+      sin_r = mx.sin(radians)
+      cos_r = mx.cos(radians)
+
+      splits = mx.split(x, 2, axis=axis)
+      x1, x2 = splits[0], splits[1]
+      result = mx.concatenate(
+          [x1 * cos_r - x2 * sin_r, x2 * cos_r + x1 * sin_r],
+          axis=axis,
+      )
+      return result.astype(x.dtype)
+
+    # Optimized mx.fast.rope path
+    offset = offset_or_positions
+    original_axes = list(range(x.ndim))
+    if x.ndim >= 3:
+      transpose_axes = original_axes.copy()
+      transpose_axes.pop(1)
+      transpose_axes.insert(-1, 1)
+      x_t = mx.transpose(x, transpose_axes)
+    else:
+      x_t = x
+
+    y_t = mx.fast.rope(
+        x_t,
+        dims=x.shape[-1],
+        traditional=False,
+        base=self.max_wavelength,
+        scale=1.0,
+        offset=offset,
+    )
+
+    if x.ndim >= 3:
+      inv_axes = original_axes.copy()
+      inv_axes.pop(-2)
+      inv_axes.insert(1, x.ndim - 2)
+      y = mx.transpose(y_t, inv_axes)
+    else:
+      y = y_t
+    return y.astype(x.dtype)
+
+  @override
+  def get_initial_state(
+      self, batch_size, input_spec, *, training: bool, constants=None
+  ):
+    self._validate()
+    self._check_inputs(input_spec)
+    if self.positions_name:
+      return ()
+    if self.only_advance_position_for_valid_timesteps:
+      return mx.full((batch_size, 1), -1, dtype=mx.int32)
+    return mx.zeros((batch_size, 1), dtype=mx.int32)
+
+  @override
+  @types.check_step
+  def step(  # pyrefly: ignore[missing-override-decorator]
+      self, x, state: Any, *, training: bool, constants=None
+  ):
+    self._check_inputs(x.channel_spec)
+    x_time = x.shape[1]
+
+    if self.positions_name:
+      if constants is None or self.positions_name not in constants:
+        raise ValueError(
+            f'Expected constants dict containing {self.positions_name!r}'
+        )
+      positions_const = constants[self.positions_name]
+      if isinstance(positions_const, (Sequence, MaskedSequence)):
+        offset_or_positions = positions_const.values
+      else:
+        offset_or_positions = positions_const
+    elif self.only_advance_position_for_valid_timesteps:
+      offset = mx.maximum(0, state[:, 0] + 1)
+      positions = state + mx.cumsum(x.mask.astype(mx.int32), axis=1)
+      state = positions[:, -1:]
+      offset_or_positions = offset
+    else:
+      offset = state[:, 0]
+      state = state + x_time
+      offset_or_positions = offset
+
+    y = x.apply_values(self._apply_rope, offset_or_positions)
+    return y, state
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    self._check_inputs(x.channel_spec)
+    if self.positions_name:
+      if constants is None or self.positions_name not in constants:
+        raise ValueError(
+            f'Expected constants dict containing {self.positions_name!r}'
+        )
+      positions_const = constants[self.positions_name]
+      if isinstance(positions_const, (Sequence, MaskedSequence)):
+        offset_or_positions = positions_const.values
+      else:
+        offset_or_positions = positions_const
+    elif self.only_advance_position_for_valid_timesteps:
+      offset_or_positions = mx.maximum(
+          0, mx.cumsum(x.mask.astype(mx.int32), axis=1) - 1
+      )
+    else:
+      offset_or_positions = mx.zeros((x.shape[0],), dtype=mx.int32)
+
+    y = x.apply_values(self._apply_rope, offset_or_positions)
+    return y
+
+  @property
+  @override
+  def receptive_field(self) -> types.ReceptiveField:
+    return (0, 0)
+
+  @classmethod
+  def from_config(cls, config):
+    """Instantiates the layer from config."""
+    mlx_config = cls.Config(
+        max_wavelength=config.max_wavelength,
+        axis=config.axis,
+        only_advance_position_for_valid_timesteps=(
+            config.only_advance_position_for_valid_timesteps
+        ),
+        positions_in_at_least_fp32=config.positions_in_at_least_fp32,
+        positions_name=config.positions_name,
+        name=config.name,
+    )
+    return cls(mlx_config)
diff --git a/sequence_layers/mlx/position_test.py b/sequence_layers/mlx/position_test.py
new file mode 100644
index 0000000..e292ce0
--- /dev/null
+++ b/sequence_layers/mlx/position_test.py
@@ -0,0 +1,142 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for position encoding MLX sequence layers."""
+
+from absl.testing import parameterized
+import mlx.core as mx
+
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import position_behaviors
+
+
+class AddTimingSignalTest(
+    position_behaviors.AddTimingSignalTest,
+    test_utils.SequenceLayerTest,
+    parameterized.TestCase,
+):
+
+  @parameterized.product(
+      param_dtype=(mx.float32, mx.float16),
+      input_dtype=(mx.float32, mx.float16, mx.bfloat16),
+      trainable_scale=(False, True),
+  )
+  def test_dtypes(
+      self,
+      param_dtype,
+      input_dtype,
+      trainable_scale,
+  ):
+    channel_shape = (2, 3)
+    min_timescale = 1.0
+    max_timescale = 1.0e4
+    config = self.sl.AddTimingSignal.Config(
+        min_timescale=min_timescale,
+        max_timescale=max_timescale,
+        trainable_scale=trainable_scale,
+        param_dtype=param_dtype,
+        name='add_timing_signal',
+    )
+    layer = self.make_layer(config)
+    batch_size = 2
+    x = self.random_sequence(batch_size, 1, *channel_shape, dtype=input_dtype)
+    layer = self.init_layer(layer, x)
+
+    # Check params dtype if trainable
+    variables = self.get_variables(layer)
+    params = variables.get('params', {}) if isinstance(variables, dict) else {}
+    if trainable_scale:
+      # In MLX, scale is a direct parameter attribute on the module if defined
+      scale_param = getattr(layer, 'scale', None)
+      self.assertIsNotNone(scale_param)
+      self.assertEqual(scale_param.dtype, param_dtype)
+
+    for time in range(13 * layer.block_size, 15 * layer.block_size):
+      x = self.random_sequence(
+          batch_size, time, *channel_shape, dtype=input_dtype
+      )
+      self.verify_contract(
+          layer,
+          x,
+          training=False,
+      )
+
+
+class ApplyRotaryPositionalEncodingTest(
+    position_behaviors.ApplyRotaryPositionalEncodingTest,
+    test_utils.SequenceLayerTest,
+    parameterized.TestCase,
+):
+
+  @parameterized.product(
+      input_dtype=(mx.float32, mx.float16, mx.bfloat16),
+      only_advance_position_for_valid_timesteps=(False, True),
+      positions_in_at_least_fp32=(False, True),
+  )
+  def test_dtypes(
+      self,
+      input_dtype,
+      only_advance_position_for_valid_timesteps,
+      positions_in_at_least_fp32,
+  ):
+    max_wavelength = 1.0e4
+    channel_shape = (2,)
+    config = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=max_wavelength,
+        only_advance_position_for_valid_timesteps=only_advance_position_for_valid_timesteps,
+        positions_in_at_least_fp32=positions_in_at_least_fp32,
+        name='rope',
+    )
+    layer = self.make_layer(config)
+    batch_size = 2
+    x = self.random_sequence(batch_size, 1, *channel_shape, dtype=input_dtype)
+    layer = self.init_layer(layer, x)
+    for time in range(13 * layer.block_size, 15 * layer.block_size):
+      x = self.random_sequence(
+          batch_size,
+          time,
+          *channel_shape,
+          random_mask=only_advance_position_for_valid_timesteps,
+          dtype=input_dtype,
+      )
+      self.verify_contract(layer, x, training=False)
+
+  def test_step_positions_advance(self):
+    max_wavelength = 10000.0
+    config = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=max_wavelength,
+        only_advance_position_for_valid_timesteps=True,
+    )
+    layer = self.make_layer(config)
+    spec = self.sl.types.ShapeDType((8,), mx.float32)
+    state = layer.get_initial_state(1, spec, training=False)
+
+    # Step with valid mask.
+    x1 = self.sl.types.MaskedSequence(
+        mx.ones((1, 1, 8)),
+        mx.ones((1, 1), dtype=mx.bool_),
+    )
+    _, state = layer.step(x1, state, training=False)
+    self.assertEqual(int(state[0, 0]), 0)
+
+    # Step with invalid mask.
+    x2 = self.sl.types.MaskedSequence(
+        mx.ones((1, 1, 8)),
+        mx.zeros((1, 1), dtype=mx.bool_),
+    )
+    _, state = layer.step(x2, state, training=False)
+    self.assertEqual(int(state[0, 0]), 0)
+
+
+if __name__ == '__main__':
+  parameterized.absltest.main()
diff --git a/sequence_layers/mlx/projection_configs.py b/sequence_layers/mlx/projection_configs.py
new file mode 100644
index 0000000..409fc22
--- /dev/null
+++ b/sequence_layers/mlx/projection_configs.py
@@ -0,0 +1,137 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MLX-native attention projection configuration dataclasses.
+
+These are pure-Python dataclasses that mirror the JAX-side projection configs
+from sequence_layers.jax.attention.common, but without JAX-specific fields
+(sharding, einsum_factory, quantization_provider). They retain initializer
+fields as Callable | None so that downstream code can still configure kernel
+and bias initialization.
+"""
+
+import dataclasses
+from typing import Callable
+
+from sequence_layers.specs import attention as attention_spec
+
+
+@dataclasses.dataclass(frozen=True)
+class QueryKeyValueProjectionConfig(attention_spec.QueryKeyValueProjectionConfig):
+  """Base class for QKV projection configuration."""
+
+
+@dataclasses.dataclass(frozen=True)
+class CombinedQueryKeyValueProjection(
+    attention_spec.CombinedQueryKeyValueProjection,
+    QueryKeyValueProjectionConfig,
+):
+  """Use a single projection matrix for query/key/value projection.
+
+  * Incompatible with Grouped Query Attention (num_query_heads != num_kv_heads).
+  * Supports shared key and value projection.
+  """
+
+  # Kernel initializer for the combined query/key/value projection.
+  # The variable shape is [input_dimension, 3, num_heads, units_per_head].
+  # If share_kv_projection is True, the variable shape is [input_dimension, 2,
+  # num_heads, units_per_head].
+  qkv_kernel_init: Callable | None = None
+
+  # Bias initializer for the combined query/key/value projection.
+  # The variable shape is [3, num_heads, units_per_head].
+  bias_init: Callable | None = None
+
+  # If true, share the key and value projection matrices.
+  share_kv_projection: bool = False
+
+
+@dataclasses.dataclass(frozen=True)
+class SeparateQueryKeyValueProjection(
+    attention_spec.SeparateQueryKeyValueProjection,
+    QueryKeyValueProjectionConfig,
+):
+  """Use separate projection matrices for query/key/value projection.
+
+  * Supports Grouped Query Attention (num_query_heads != num_kv_heads).
+  * Does not support shared key and value projection. Use
+    QueryAndSharedKeyValueProjection.
+  """
+
+  # Kernel initializers for the separate query/key/value projections.
+  # The variable shape is [input_dimension, num_heads or num_kv_heads,
+  # units_per_head].
+  q_kernel_init: Callable | None = None
+  k_kernel_init: Callable | None = None
+  v_kernel_init: Callable | None = None
+
+  # Bias initializer for the separate query/key/value projections.
+  # The variable shape is [num_heads or num_kv_heads, units_per_head].
+  bias_init: Callable | None = None
+
+
+@dataclasses.dataclass(frozen=True)
+class QueryAndKeyValueProjection(
+    attention_spec.QueryAndKeyValueProjection,
+    QueryKeyValueProjectionConfig,
+):
+  """Use separate query and key/value projection matrices.
+
+  * Supports Grouped Query Attention (num_query_heads != num_kv_heads).
+  * Does not support shared key and value projection. Use
+    QueryAndSharedKeyValueProjection.
+  """
+
+  # Kernel initializer for the query projection.
+  # The variable shape is [input_dimension, num_heads, units_per_head].
+  q_kernel_init: Callable | None = None
+
+  # Bias initializer for the query projection.
+  # The variable shape is [num_heads, units_per_head].
+  q_bias_init: Callable | None = None
+
+  # Kernel initializer for the key/value projection.
+  # The variable shape is [input_dimension, 2, num_kv_heads, units_per_head].
+  kv_kernel_init: Callable | None = None
+
+  # Bias initializer for the key/value projection.
+  # The variable shape is [2, num_kv_heads, units_per_head].
+  kv_bias_init: Callable | None = None
+
+
+@dataclasses.dataclass(frozen=True)
+class QueryAndSharedKeyValueProjection(
+    attention_spec.QueryAndSharedKeyValueProjection,
+    QueryKeyValueProjectionConfig,
+):
+  """Use separate query and shared key/value projection matrices.
+
+  * Supports Grouped Query Attention (num_query_heads != num_kv_heads).
+  * Requires shared key and value projection.
+  """
+
+  # Kernel initializer for the query projection.
+  # The variable shape is [input_dimension, num_heads, units_per_head].
+  q_kernel_init: Callable | None = None
+
+  # Bias initializer for the query projection.
+  # The variable shape is [num_heads, units_per_head].
+  q_bias_init: Callable | None = None
+
+  # Kernel initializer for the shared key/value projection.
+  # The variable shape is [input_dimension, num_kv_heads, units_per_head].
+  kv_kernel_init: Callable | None = None
+
+  # Bias initializer for the shared key/value projection.
+  # The variable shape is [num_kv_heads, units_per_head].
+  kv_bias_init: Callable | None = None
diff --git a/sequence_layers/mlx/signal.py b/sequence_layers/mlx/signal.py
new file mode 100644
index 0000000..3452559
--- /dev/null
+++ b/sequence_layers/mlx/signal.py
@@ -0,0 +1,62 @@
+"""Signal utilities for MLX, ported from sequence_layers.jax.signal."""
+
+import numpy as np
+import mlx.core as mx
+
+
+def _raised_cosine_window(window_length, periodic, dtype, a, b):
+  """Computes a raised cosine window."""
+  if window_length == 1:
+    return np.ones([1], dtype=dtype)
+  even = 1 - window_length % 2
+  n = np.asarray(window_length + int(periodic) * even - 1, dtype=dtype)
+  count = np.arange(window_length, dtype=dtype)
+  cos_arg = 2 * np.pi * count / n
+  return a - b * np.cos(cos_arg)
+
+
+def hann_window(window_length, periodic=True, dtype=np.float32):
+  """Computes a hann window. Ported from tf.signal."""
+  return _raised_cosine_window(window_length, periodic, dtype, 0.5, 0.5)
+
+
+def hamming_window(window_length, periodic=True, dtype=np.float32):
+  """Computes a Hamming window."""
+  a0 = 0.54
+  return _raised_cosine_window(window_length, periodic, dtype, a0, 1.0 - a0)
+
+
+def inverse_stft_window_fn(frame_step, forward_window_fn=hann_window):
+  """Generates a window function that can be used in inverse STFT.
+
+  Constructs a window that is equal to the forward window with a further
+  pointwise amplitude correction.
+
+  Args:
+    frame_step: The number of samples to step.
+    forward_window_fn: Window function used in the forward STFT transform.
+
+  Returns:
+    A callable that takes a window length and a dtype keyword argument and
+    returns a [window_length] array of window samples.
+  """
+
+  def inverse_stft_window_fn_inner(frame_length, dtype=np.float32):
+    """Computes a window suitable for inverse STFT reconstruction."""
+    # Use equation 7 from Griffin + Lim.
+    forward_window = forward_window_fn(frame_length, dtype=dtype)
+    # Convert to mx array for computation.
+    fw = mx.array(forward_window, dtype=mx.float32)
+    denom = mx.square(fw)
+    overlaps = -(-frame_length // frame_step)  # Ceiling division.
+    denom = mx.pad(denom, [(0, overlaps * frame_step - frame_length)])
+    denom = mx.reshape(denom, [overlaps, frame_step])
+    denom = mx.sum(denom, axis=0, keepdims=True)
+    denom = mx.tile(denom, [overlaps, 1])
+    denom = mx.reshape(denom, [overlaps * frame_step])
+    denom = denom[:frame_length]
+    result = mx.where(denom == 0.0, 0, fw / denom)
+    # Convert back to numpy for consistency with the forward window.
+    return np.array(result, dtype=dtype)
+
+  return inverse_stft_window_fn_inner
diff --git a/sequence_layers/mlx/simple.py b/sequence_layers/mlx/simple.py
new file mode 100644
index 0000000..1744734
--- /dev/null
+++ b/sequence_layers/mlx/simple.py
@@ -0,0 +1,1591 @@
+"""Simple sequence layers for MLX."""
+
+import dataclasses
+from fractions import Fraction
+import math
+from typing import Any, Callable, override
+
+from absl import logging
+from mlx import nn
+import mlx.core as mx
+import numpy as np
+
+from sequence_layers.mlx import types
+from sequence_layers.specs import simple as spec
+
+Sequence = types.Sequence
+MaskedSequence = types.MaskedSequence
+ShapeDType = types.ShapeDType
+
+
+def _to_mx_dtype(dtype: Any) -> mx.Dtype | None:
+  """Converts various dtype representations to MLX DType."""
+  if dtype is None:
+    return None
+  if isinstance(dtype, str):
+    if dtype == 'float32':
+      return mx.float32
+    if dtype == 'float16':
+      return mx.float16
+    if dtype == 'int32':
+      return mx.int32
+    if dtype == 'bool':
+      return mx.bool_
+  if dtype == np.float32:
+    return mx.float32
+  if dtype == np.float16:
+    return mx.float16
+  if dtype == np.int32:
+    return mx.int32
+  if dtype in (np.bool_, bool):
+    return mx.bool_
+  return dtype
+
+
+# ---------------------------------------------------------------------------
+# Identity
+# ---------------------------------------------------------------------------
+
+
+class Identity(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.Identity[types.Sequence, types.ShapeDType],
+):
+  """Identity pass-through of the input."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Identity.Config):
+    """Configuration for Identity layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Identity':
+      """Creates the Identity layer."""
+      return Identity(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @override
+  @types.check_layer
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Returns the input sequence unchanged."""
+    return x
+
+
+# ---------------------------------------------------------------------------
+# Activation layers
+# ---------------------------------------------------------------------------
+
+
+class Relu(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Relu[types.Sequence, types.ShapeDType],
+):
+  """A Relu layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Relu.Config):
+    """Configuration for Relu layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Relu':
+      """Creates the Relu layer."""
+      return Relu(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using ReLU."""
+    return nn.relu(values), mask
+
+
+class Gelu(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Gelu[types.Sequence, types.ShapeDType],
+):
+  """A Gelu layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Gelu.Config):
+    """Configuration for Gelu layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Gelu':
+      """Creates the Gelu layer."""
+      return Gelu(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using GELU."""
+    return nn.gelu(values), mask
+
+
+class Abs(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Abs[types.Sequence, types.ShapeDType],
+):
+  """Absolute value layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Abs.Config):
+    """Configuration for Abs layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Abs':
+      """Creates the Abs layer."""
+      return Abs(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using absolute value."""
+    return mx.abs(values), mask
+
+
+class Exp(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Exp[types.Sequence, types.ShapeDType],
+):
+  """Exponential layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Exp.Config):
+    """Configuration for Exp layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Exp':
+      """Creates the Exp layer."""
+      return Exp(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using exponential."""
+    return mx.exp(values), mask
+
+
+class Log(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Log[types.Sequence, types.ShapeDType],
+):
+  """Logarithm layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Log.Config):
+    """Configuration for Log layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Log':
+      """Creates the Log layer."""
+      return Log(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using natural logarithm."""
+    return mx.log(values), mask
+
+
+class Swish(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Swish[types.Sequence, types.ShapeDType],
+):
+  """A Swish/SiLU layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Swish.Config):
+    """Configuration for Swish layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Swish':
+      """Creates the Swish layer."""
+      return Swish(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using Swish (SiLU)."""
+    return nn.silu(values), mask
+
+
+class Tanh(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Tanh[types.Sequence, types.ShapeDType],
+):
+  """A tanh layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Tanh.Config):
+    """Configuration for Tanh layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Tanh':
+      """Creates the Tanh layer."""
+      return Tanh(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using hyperbolic tangent."""
+    return mx.tanh(values), mask
+
+
+class Sigmoid(
+    types.PreservesType, types.StatelessPointwiseFunctor, spec.Sigmoid
+):
+  """A sigmoid layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Sigmoid.Config):
+    """Configuration for Sigmoid layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Sigmoid':
+      """Creates the Sigmoid layer."""
+      return Sigmoid(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using Sigmoid."""
+    return mx.sigmoid(values), mask
+
+
+class LeakyRelu(
+    types.PreservesType, types.StatelessPointwiseFunctor, spec.LeakyRelu
+):
+  """A Leaky Relu layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.LeakyRelu.Config):
+    """Configuration for LeakyRelu layer."""
+
+    negative_slope: float = 0.01
+    name: str | None = None
+
+    @override
+    def make(self) -> 'LeakyRelu':
+      """Creates the LeakyRelu layer."""
+      return LeakyRelu(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using Leaky ReLU."""
+    return nn.leaky_relu(values, self.config.negative_slope), mask
+
+
+class Elu(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.Elu[types.Sequence, types.ShapeDType],
+):
+  """An ELU activation layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Elu.Config):
+    """Configuration for Elu layer."""
+
+    alpha: complex = 1.0
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Elu':
+      """Creates the Elu layer."""
+      return Elu(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using ELU."""
+    return nn.elu(values, self.config.alpha), mask
+
+
+class Softmax(
+    types.PreservesType, types.StatelessPointwiseFunctor, spec.Softmax
+):
+  """A softmax layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Softmax.Config):
+    """Configuration for Softmax layer."""
+
+    axis: int = -1
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Softmax':
+      """Creates the Softmax layer."""
+      return Softmax(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using Softmax."""
+    axis = self.config.axis
+    if (axis if axis >= 0 else values.ndim + axis) < 2:
+      raise ValueError(
+          'The softmax cannot be applied on the batch or time'
+          f' dimension (got {axis=} for shape={values.shape})'
+      )
+    return mx.softmax(values, axis=axis), mask
+
+
+class Softplus(
+    types.PreservesType, types.StatelessPointwiseFunctor, spec.Softplus
+):
+  """A softplus layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Softplus.Config):
+    """Configuration for Softplus layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Softplus':
+      """Creates the Softplus layer."""
+      return Softplus(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Transforms each scalar in values independently using Softplus."""
+    return nn.softplus(values), mask
+
+
+# ---------------------------------------------------------------------------
+# Value manipulation
+# ---------------------------------------------------------------------------
+
+
+class Cast(
+    types.StatelessPointwiseFunctor, spec.Cast[types.Sequence, types.ShapeDType]
+):
+  """Cast input values to the specified type."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig, spec.Cast.Config):
+    """Configuration for Cast layer."""
+
+    dtype: object = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Cast':
+      return Cast(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._dtype = _to_mx_dtype(config.dtype)
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Casts input values to the specified type."""
+    return values.astype(self._dtype), mask  # type: ignore
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None) -> mx.Dtype:
+    assert self._dtype is not None
+    return self._dtype
+
+
+class Scale(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.Scale[types.Sequence, types.ShapeDType],
+):
+  """Scales the input by a provided constant or array."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Scale.Config):
+    """Configuration for Scale layer."""
+
+    scale: complex | np.ndarray | types.HashableArray = 1.0
+    name: str | None = None
+
+    def __post_init__(self):
+      object.__setattr__(
+          self, 'scale', types.HashableArray.from_array(self.scale)
+      )
+
+    @override
+    def make(self) -> 'Scale':
+      """Creates the Scale layer."""
+      return Scale(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    assert isinstance(config.scale, types.HashableArray)
+    self._scale = mx.array(config.scale.to_array())
+
+  @override
+  def get_output_shape(
+      self,
+      input_shape: types.ShapeLike,
+      *,
+      constants: types.Constants | None = None,
+  ) -> types.Shape:
+    del constants
+    s_shape = (
+        ()
+        if isinstance(self._scale, (int, float, complex))
+        else self._scale.shape
+    )
+    if len(s_shape) > len(input_shape):
+      raise ValueError(
+          f'Scale parameter has too many dimensions ({len(s_shape)}) to'
+          f' broadcast with input shape ({len(input_shape)}).'
+      )
+    try:
+      return np.broadcast_shapes(tuple(input_shape), s_shape)
+    except ValueError as e:
+      raise ValueError(
+          f'Cannot broadcast shape {input_shape} with scale shape {s_shape}'
+      ) from e
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Scales the input sequence by a learned or fixed scale."""
+    return x.apply_values_masked(lambda v: v * self._scale)
+
+
+class Add(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.Add[types.Sequence, types.ShapeDType],
+):
+  """Adds a provided constant or array to the input."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Add.Config):
+    """Configuration for Add layer."""
+
+    shift: Any
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Add':
+      """Creates the Add layer."""
+      return Add(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    shift = config.shift
+    if hasattr(shift, 'data') and hasattr(shift, 'dtype'):
+      self._shift = mx.array(np.array(shift.data, dtype=shift.dtype))
+    elif hasattr(shift, 'array'):
+      self._shift = mx.array(np.asarray(shift.array))
+    elif isinstance(shift, np.ndarray):
+      self._shift = mx.array(shift)
+    else:
+      self._shift = shift
+
+  @override
+  def get_output_shape(
+      self,
+      input_shape: types.ShapeLike,
+      *,
+      constants: types.Constants | None = None,
+  ) -> types.Shape:
+    del constants
+    s_shape = (
+        ()
+        if isinstance(self._shift, (int, float, complex))
+        else self._shift.shape
+    )
+    if len(s_shape) > len(input_shape):
+      raise ValueError(
+          f'Shift parameter has too many dimensions ({len(s_shape)}) to'
+          f' broadcast with input shape ({len(input_shape)}).'
+      )
+    try:
+      return np.broadcast_shapes(tuple(input_shape), s_shape)
+    except ValueError as e:
+      raise ValueError(
+          f'Cannot broadcast shape {input_shape} with shift shape {s_shape}'
+      ) from e
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Adds a learned or fixed shift to the input sequence."""
+    return x.apply_values(lambda v: v + self._shift)
+
+
+# ---------------------------------------------------------------------------
+# Masking
+# ---------------------------------------------------------------------------
+
+
+class MaskInvalid(
+    types.PreservesType, types.StatelessPointwise, spec.MaskInvalid
+):
+  """Masks invalid timesteps to zero (or a specified value)."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.MaskInvalid.Config):
+    """Configuration for MaskInvalid layer."""
+
+    mask_value: Any = None
+    name: str | None = None
+
+    @override
+    def make(self) -> 'MaskInvalid':
+      """Creates the MaskInvalid layer."""
+      return MaskInvalid(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Masks invalid values (NaN, Inf) in the input sequence."""
+    return x.mask_invalid(self.config.mask_value)
+
+
+# ---------------------------------------------------------------------------
+# Gated units
+# ---------------------------------------------------------------------------
+
+
+class GatedUnit(
+    types.PreservesType,
+    types.Stateless,
+    spec.GatedUnit[types.Sequence, types.ShapeDType],
+):
+  """Computes a generalized Gated Unit, reducing input channels by 2x."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.GatedUnit.Config):
+    """Configuration for GatedUnit layer."""
+
+    feature_activation: Callable | None = None
+    gate_activation: Callable | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> 'GatedUnit':
+      return GatedUnit(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._feature_activation = config.feature_activation
+    self._gate_activation = config.gate_activation
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    channels = input_shape[-1]
+    if channels % 2 != 0:
+      raise ValueError(
+          f'Final dimension of input ({input_shape=}) must have'
+          ' an even number of channels.'
+      )
+    return tuple(input_shape[:-1]) + (channels // 2,)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Applies a gated unit to the input sequence."""
+    feature, gate = mx.split(x.values, 2, axis=-1)
+    if self._feature_activation:
+      feature = self._feature_activation(feature)
+    if self._gate_activation:
+      gate = self._gate_activation(gate)
+    return Sequence(feature * gate, x.mask)
+
+
+class GatedLinearUnit(
+    GatedUnit, spec.GatedLinearUnit[types.Sequence, types.ShapeDType]
+):
+  """Computes a Gated Linear Unit, reducing input channels by 2x."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(GatedUnit.Config, spec.GatedLinearUnit.Config):
+    """Configuration for GatedLinearUnit layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'GatedLinearUnit':
+      """Create GatedLinearUnit layer."""
+      return GatedLinearUnit(
+          GatedUnit.Config(
+              feature_activation=None,
+              gate_activation=mx.sigmoid,
+              name=self.name,
+          )
+      )
+
+
+class GatedTanhUnit(
+    GatedUnit, spec.GatedTanhUnit[types.Sequence, types.ShapeDType]
+):
+  """Computes a Gated Tanh Unit, reducing input channels by 2x."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(GatedUnit.Config, spec.GatedTanhUnit.Config):
+    """Configuration for GatedTanhUnit layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'GatedTanhUnit':
+      return GatedTanhUnit(
+          GatedUnit.Config(
+              feature_activation=mx.tanh,
+              gate_activation=mx.sigmoid,
+              name=self.name,
+          )
+      )
+
+
+# ---------------------------------------------------------------------------
+# Shape manipulation
+# ---------------------------------------------------------------------------
+
+
+class Flatten(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.Flatten[types.Sequence, types.ShapeDType],
+):
+  """Flattens the channel dimensions of the input sequence."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types.SequenceLayerConfig):
+    """Configuration for Flatten layer."""
+
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Flatten':
+      return Flatten(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return (math.prod(input_shape),)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Flattens the channel dimensions of the input sequence."""
+    batch_size, time = x.values.shape[:2]
+    num_elements = math.prod(x.channel_shape)
+    new_values = mx.reshape(x.values, (batch_size, time, num_elements))
+    if isinstance(x, MaskedSequence):
+      return MaskedSequence(new_values, x.mask)
+    return Sequence(new_values, x.mask)
+
+
+class Reshape(
+    types.PreservesType,
+    types.Stateless,
+    spec.Reshape[types.Sequence, types.ShapeDType],
+):
+  """Reshapes the channels dimension of the input."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Reshape.Config):
+    """Configuration for Reshape layer."""
+
+    output_shape: tuple[int, ...] = ()
+    name: str | None = None
+
+    def __post_init__(self):
+      object.__setattr__(self, 'output_shape', tuple(self.output_shape))
+
+    @override
+    def make(self) -> 'Reshape':
+      return Reshape(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  def _validate(self, input_shape):
+    """Validates that input and output shapes have the same number of elements."""
+    in_elems = math.prod(input_shape)
+
+    out_elems = math.prod(self.config.output_shape)
+    if in_elems != out_elems:
+      raise ValueError(
+          f'Reshape output_shape={self.config.output_shape} must have'
+          f' the same number of elements as {input_shape=}.'
+      )
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    self._validate(input_shape)
+    return self.config.output_shape
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Reshapes the channel dimensions of the input sequence."""
+    self._validate(x.channel_shape)
+    b, t = x.values.shape[:2]
+    new_values = mx.reshape(x.values, (b, t) + self.config.output_shape)
+    if isinstance(x, MaskedSequence):
+      return MaskedSequence(new_values, x.mask)
+    return Sequence(new_values, x.mask)
+
+
+class ExpandDims(
+    types.PreservesType,
+    types.Stateless,
+    spec.ExpandDims[types.Sequence, types.ShapeDType],
+):
+  """Expands channel dimensions of the input sequence."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.ExpandDims.Config):
+    """Configuration for ExpandDims layer."""
+
+    axis: int | tuple[int, ...] = 0
+    name: str | None = None
+
+    def __post_init__(self):
+      if not isinstance(self.axis, int):
+        object.__setattr__(self, 'axis', tuple(self.axis))
+
+    @override
+    def make(self) -> 'ExpandDims':
+      return ExpandDims(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._axis: tuple[int, ...] = (
+        (config.axis,) if isinstance(config.axis, int) else tuple(config.axis)
+    )
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  def _normalize_axes(self, input_shape):
+    """Normalizes axes to positive indices."""
+    rank = len(input_shape)
+
+    dims = sorted(set(a + rank + 1 if a < 0 else a for a in self._axis))
+    for d in dims:
+      if d < 0 or d > rank:
+        raise ValueError(
+            f'ExpandDims axes must refer to channel dims. Got: {self._axis}.'
+        )
+    return dims
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    dims = self._normalize_axes(input_shape)
+    out = list(input_shape)
+    for a in dims:
+      out.insert(a, 1)
+    return tuple(out)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Expands the dimensions of the input sequence by inserting new axes."""
+    dims = [2 + d for d in self._normalize_axes(x.channel_shape)]
+    new_values = mx.expand_dims(x.values, axis=dims)
+    if isinstance(x, MaskedSequence):
+      return MaskedSequence(new_values, x.mask)
+    return Sequence(new_values, x.mask)
+
+
+class Squeeze(
+    types.PreservesType,
+    types.Stateless,
+    spec.Squeeze[types.Sequence, types.ShapeDType],
+):
+  """Squeezes singleton channel dimensions of the input."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Squeeze.Config):
+    """Configuration for Squeeze layer."""
+
+    axis: int | tuple[int, ...] | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Squeeze':
+      return Squeeze(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  def _channel_squeeze_axes(self, input_shape):
+    """Return channel-relative axes to squeeze."""
+    if self.config.axis is None:
+      # Squeeze all singleton channel dims.
+      return tuple(i for i, n in enumerate(input_shape) if n == 1)
+    # If axis is given, it's in full-tensor coords. Convert to channel.
+    if isinstance(self.config.axis, int):
+      axes = (self.config.axis,)
+    else:
+      axes = tuple(self.config.axis)
+    return axes
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    squeeze_axes = self._channel_squeeze_axes(input_shape)
+    out = []
+    for i, s in enumerate(input_shape):
+      if i not in squeeze_axes:
+        out.append(s)
+    return tuple(out)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Squeezes the dimensions of the input sequence by removing axes of size 1."""
+    ch_axes = self._channel_squeeze_axes(x.channel_shape)
+    # Convert to full-tensor axes (offset by 2 for batch, time).
+    full_axes = tuple(a + 2 for a in ch_axes)
+    new_values = mx.squeeze(x.values, axis=full_axes)
+    if isinstance(x, MaskedSequence):
+      return MaskedSequence(new_values, x.mask)
+    return Sequence(new_values, x.mask)
+
+
+class Transpose(
+    types.PreservesType,
+    types.Stateless,
+    spec.Transpose[types.Sequence, types.ShapeDType],
+):
+  """Permutes the channel axes of the input."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Transpose.Config):
+    """Configuration for Transpose layer."""
+
+    axes: tuple[int, ...] | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Transpose':
+      return Transpose(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._axes: tuple[int, ...] | None = (
+        tuple(config.axes) if config.axes is not None else None
+    )
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  def _resolve_axes(self, input_shape):
+    """Resolves axes for transpose."""
+    input_axes = tuple(range(2, 2 + len(input_shape)))
+
+    if self._axes is None:
+      return input_axes[::-1]
+    sorted_axes = tuple(sorted(self._axes))
+    if sorted_axes != input_axes:
+      raise ValueError(
+          f'Provided axes {sorted_axes} do not match input axes {input_axes}.'
+      )
+    return tuple(self._axes)
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    axes = self._resolve_axes(input_shape)
+    return tuple(input_shape[a - 2] for a in axes)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Transposes the channel dimensions of the input sequence."""
+    axes = self._resolve_axes(x.channel_shape)
+    new_values = mx.transpose(x.values, (0, 1) + axes)
+    if isinstance(x, MaskedSequence):
+      return MaskedSequence(new_values, x.mask)
+    return Sequence(new_values, x.mask)
+
+
+# ---------------------------------------------------------------------------
+# Encoding
+# ---------------------------------------------------------------------------
+
+
+class OneHot(types.Stateless, spec.OneHot[types.Sequence, types.ShapeDType]):
+  """Computes one-hot vector of the input."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.OneHot.Config):
+    """Configuration for OneHot layer."""
+
+    depth: int
+    compute_dtype: Any = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'OneHot':
+      return OneHot(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._compute_dtype = _to_mx_dtype(config.compute_dtype)
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return tuple(input_shape) + (self.config.depth,)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None) -> mx.Dtype:
+    assert self._compute_dtype is not None
+    return self._compute_dtype
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Converts integer values to one-hot representations."""
+
+    def one_hot_fn(v):
+      indices = v.astype(mx.int32)
+      return mx.eye(self.config.depth, dtype=self._compute_dtype)[indices]
+
+    return x.apply_values(one_hot_fn)
+
+
+class Embedding(
+    types.Stateless, spec.Embedding[types.Sequence, types.ShapeDType]
+):
+  """Computes embeddings of integer input codes.
+
+  Backed by mlx.nn.Embedding.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Embedding.Config):
+    """Configuration for Embedding layer."""
+
+    num_embeddings: int = 1
+    dimension: int = 1
+    compute_dtype: types.DType | None = None
+    param_dtype: types.DType = mx.float32
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Embedding':
+      return Embedding(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._param_dtype = _to_mx_dtype(config.param_dtype)
+    self._compute_dtype = (
+        _to_mx_dtype(config.compute_dtype)
+        if config.compute_dtype is not None
+        else None
+    )
+    self._embedding = nn.Embedding(config.num_embeddings, config.dimension)
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return tuple(input_shape) + (self.config.dimension,)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None) -> mx.Dtype:
+    if self._compute_dtype is not None:
+      return self._compute_dtype
+    assert self._param_dtype is not None
+    return self._param_dtype
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Embeds integer values using a learned embedding matrix."""
+
+    def embed_fn(v):
+      result = self._embedding(v.astype(mx.int32))
+      compute_dtype = self._compute_dtype
+      if compute_dtype is not None:
+        result = result.astype(compute_dtype)  # type: ignore
+      return result
+
+    return x.apply_values(embed_fn)
+
+
+# ---------------------------------------------------------------------------
+# Regularization
+# ---------------------------------------------------------------------------
+
+
+class Dropout(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.Dropout[types.Sequence, types.ShapeDType],
+):
+  """Dropout layer (pass-through during inference)."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Dropout.Config):
+    """Configuration for Dropout layer."""
+
+    rate: float = 0.0
+    broadcast_dims: tuple[int, ...] = ()
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Dropout':
+      """Creates the Dropout layer."""
+      return Dropout(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Applies dropout to the input sequence."""
+    if training:
+      raise NotImplementedError('Dropout training is not implemented in MLX.')
+    # Inference-only: dropout is a no-op.
+    return x
+
+
+# ---------------------------------------------------------------------------
+# Sampling
+# ---------------------------------------------------------------------------
+
+
+class Downsample1D(
+    types.PreservesType,
+    types.Stateless,
+    spec.Downsample1D[types.Sequence, types.ShapeDType],
+):
+  """A 1D downsampling layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Downsample1D.Config):
+    """Configuration for Downsample1D layer."""
+
+    rate: int
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Downsample1D':
+      return Downsample1D(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def block_size(self):
+    return self.config.rate
+
+  @property
+  @override
+  def output_ratio(self):
+    return Fraction(1, self.config.rate)
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return tuple(input_shape)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Downsamples the input sequence along the time axis."""
+    new_values = x.values[:, :: self.config.rate]
+    new_mask = x.mask[:, :: self.config.rate]
+    if isinstance(x, MaskedSequence):
+      return MaskedSequence(new_values, new_mask)
+    return Sequence(new_values, new_mask)
+
+
+class Upsample1D(
+    types.PreservesType,
+    types.Stateless,
+    spec.Upsample1D[types.Sequence, types.ShapeDType],
+):
+  """A 1D upsampling layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Upsample1D.Config):
+    """Configuration for Upsample1D layer."""
+
+    rate: int
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Upsample1D':
+      return Upsample1D(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @property
+  @override
+  def output_ratio(self):
+    return Fraction(self.config.rate, 1)
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    return tuple(input_shape)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Upsamples the input sequence along the time axis."""
+    # Repeat each timestep `rate` times along the time axis.
+    b, t = x.values.shape[:2]
+    channel_shape = x.values.shape[2:]
+    # [b, t, 1, ...] -> [b, t, rate, ...] -> [b, t*rate, ...]
+    expanded = mx.expand_dims(x.values, axis=2)
+    tiled = mx.repeat(expanded, self.config.rate, axis=2)
+    new_values = mx.reshape(tiled, (b, t * self.config.rate) + channel_shape)
+    # Same for mask: [b, t] -> [b, t*rate]
+    new_mask = mx.repeat(
+        mx.expand_dims(x.mask, axis=2), self.config.rate, axis=2
+    )
+    new_mask = mx.reshape(new_mask, (b, t * self.config.rate))
+    if isinstance(x, MaskedSequence):
+      return MaskedSequence(new_values, new_mask)
+    return Sequence(new_values, new_mask)
+
+
+# ---------------------------------------------------------------------------
+# CheckpointName (identity for inference)
+# ---------------------------------------------------------------------------
+
+
+class CheckpointName(
+    types.PreservesType,
+    types.StatelessPointwiseFunctor,
+    spec.CheckpointName[types.Sequence, types.ShapeDType],
+):
+  """Identity pass-through (checkpoint naming is JAX-only)."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.CheckpointName.Config):
+    """Configuration for CheckpointName layer."""
+
+    checkpoint_name: str = ''
+    name: str | None = None
+
+    @override
+    def make(self) -> 'CheckpointName':
+      """Creates the CheckpointName layer."""
+      return CheckpointName(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @override
+  def get_accumulated_input_latency(self, input_latency: int) -> int:
+    return super().get_accumulated_input_latency(input_latency)
+
+  @property
+  @override
+  def mask_required(self):
+    return False
+
+  @override
+  def fn(self, values: mx.array, mask: mx.array) -> tuple[mx.array, mx.array]:
+    """Identity function for CheckpointName."""
+    return values, mask
+
+
+# ---------------------------------------------------------------------------
+# Lambda
+# ---------------------------------------------------------------------------
+
+
+class Lambda(types.Stateless, spec.Lambda[types.Sequence, types.ShapeDType]):
+  """A SequenceLayer that wraps a Python callable."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Lambda.Config):
+    """Configuration for Lambda layer."""
+
+    fn: Callable
+    sequence_input: bool = False
+    mask_required: bool = True
+    # Accepted for JAX compatibility but ignored by MLX Lambda.
+    expected_input_spec: object = None
+    expected_output_spec: object = None
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Lambda':
+      return Lambda(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+    self._cached_output_specs = {}
+
+  @property
+  @override
+  def receptive_field(self) -> tuple[int, int]:
+    return (0, 0)
+
+  def _probe_output(self, input_shape, input_dtype):
+    """Probe the function with a dummy to infer output shape/dtype."""
+    if self.config.expected_output_spec is not None:
+      return self.config.expected_output_spec
+    cache_key = (tuple(input_shape), input_dtype)
+    if cache_key in self._cached_output_specs:
+      return self._cached_output_specs[cache_key]
+    try:
+      dummy_values = mx.zeros((1, 1) + tuple(input_shape), dtype=input_dtype)
+      dummy_mask = mx.ones((1, 1), dtype=mx.bool_)
+      assert self.config.fn is not None
+      if self.config.sequence_input:
+        result = self.config.fn(Sequence(dummy_values, dummy_mask))
+        out_shape = result.values.shape[2:]
+        out_dtype = result.values.dtype
+      else:
+        out_values = self.config.fn(dummy_values)
+        out_shape = out_values.shape[2:]
+        out_dtype = out_values.dtype
+      out_spec = types.ShapeDType(out_shape, out_dtype)
+      self._cached_output_specs[cache_key] = out_spec
+      return out_spec
+    except Exception:  # pylint: disable=broad-exception-caught
+      return None
+
+  @override
+  def get_output_shape(self, input_shape, *, constants=None):
+    out_spec = self._probe_output(input_shape, mx.float32)
+    if out_spec is not None:
+      return tuple(out_spec.shape)
+    return tuple(input_shape)
+
+  @override
+  def get_output_dtype(self, input_dtype, *, constants=None):
+    out_spec = self._probe_output((1,), input_dtype)
+    if out_spec is not None:
+      return out_spec.dtype
+    return input_dtype
+
+  @override
+  def layer(self, x, *, training: bool, constants=None):
+    """Applies a custom Python callable to the input sequence."""
+    assert self.config.fn is not None
+    if self.config.sequence_input:
+      result = self.config.fn(x)
+      if not isinstance(result, (Sequence, MaskedSequence)):
+        raise ValueError(
+            'Lambda with sequence_input=True must return a Sequence, '
+            f'got {type(result)}'
+        )
+      return result
+
+    new_values = self.config.fn(x.values)
+    if self.config.mask_required or not isinstance(x, MaskedSequence):
+      return Sequence(new_values, x.mask)
+    return MaskedSequence(new_values, x.mask)
+
+
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+
+
+class Logging(
+    types.PreservesType,
+    types.StatelessPointwise,
+    spec.Logging[types.Sequence, types.ShapeDType],
+):
+  """Logs input info and returns the input unchanged."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(spec.Logging.Config):
+    """Configuration for Logging layer."""
+
+    prefix: str = ''
+    dump_tensors: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> 'Logging':
+      """Creates the Logging layer."""
+      return Logging(self)
+
+  def __init__(self, config: Config):
+    super().__init__()
+    self.config = config
+
+  @override
+  def get_initial_state(
+      self,
+      batch_size: int,
+      input_spec: types.ChannelSpec,
+      *,
+      training: bool,
+      constants: types.Constants | None = None,
+  ) -> types.State:
+    if self.config.dump_tensors:
+      logging.info(
+          f'{self.config.prefix} get_initial_state(): batch_size={batch_size}, '
+          f'input_spec={input_spec}, training={training}, constants={constants}'
+      )
+    else:
+      logging.info(
+          f'{self.config.prefix} get_initial_state(): batch_size={batch_size}, '
+          f'input_spec={input_spec}, training={training}'
+      )
+    return super().get_initial_state(
+        batch_size, input_spec, training=training, constants=constants
+    )
+
+  @override
+  def step(
+      self,
+      x: types.Sequence,
+      state: types.State,
+      *,
+      training: bool,
+      constants: types.Constants | None = None,
+  ) -> tuple[types.Sequence, types.State]:
+    if self.config.dump_tensors:
+      logging.info(
+          f'{self.config.prefix} step(): x={x.values}, state={state}, '
+          f'training={training}, constants={constants}'
+      )
+    else:
+      logging.info(
+          f'{self.config.prefix} step(): x.shape={x.shape}, x.dtype={x.dtype}, '
+          f'state={state}, training={training}'
+      )
+    return super().step(x, state, training=training, constants=constants)
+
+  @types.check_layer
+  @override
+  def layer(  # pyrefly: ignore[missing-override-decorator]
+      self, x, *, training: bool, constants=None
+  ):
+    """Logs the input sequence values for debugging."""
+    if self.config.dump_tensors:
+      logging.info(
+          f'{self.config.prefix} layer(): x={x.values}, training={training},'
+          f' constants={constants}'
+      )
+    else:
+      logging.info(
+          f'{self.config.prefix} layer(): x.shape={x.shape}, x.dtype={x.dtype},'
+          f' training={training}'
+      )
+    return x
diff --git a/sequence_layers/mlx/simple_test.py b/sequence_layers/mlx/simple_test.py
new file mode 100644
index 0000000..f4d54fa
--- /dev/null
+++ b/sequence_layers/mlx/simple_test.py
@@ -0,0 +1,200 @@
+"""Tests for simple MLX sequence layers."""
+
+from typing import Any, override
+
+from absl.testing import absltest
+import numpy as np
+
+from sequence_layers.mlx import simple
+from sequence_layers.mlx import test_utils
+from sequence_layers.specs import simple_behaviors as spec
+
+
+class ModuleSpecTest(test_utils.SequenceLayerTest, spec.ModuleSpecTest):
+  pass
+
+
+class IdentityTest(test_utils.SequenceLayerTest, spec.IdentityTest):
+
+  def test_preserves_values(self):
+    layer = simple.Identity.Config().make()
+    x = self.random_sequence(2, 3, 4)
+    y = layer.layer(x, training=False)
+    np.testing.assert_array_equal(y.values, x.values)
+    np.testing.assert_array_equal(y.mask, x.mask)
+
+
+class PointwiseMathTest(test_utils.SequenceLayerTest, spec.PointwiseMathTest):
+
+  @override
+  def make_layer(  # pyrefly: ignore[bad-override-param-name]
+      self, config: Any
+  ) -> Any:
+    if isinstance(config, str):
+      layer_cls = getattr(self.sl, config)
+      return layer_cls(layer_cls.Config())
+    return super().make_layer(config)
+
+
+class CastTest(test_utils.SequenceLayerTest, spec.CastTest):
+  pass
+
+
+class ScaleTest(test_utils.SequenceLayerTest, spec.ScaleTest):
+  pass
+
+
+class AddTest(test_utils.SequenceLayerTest, spec.AddTest):
+  pass
+
+
+class MaskInvalidTest(test_utils.SequenceLayerTest, spec.MaskInvalidTest):
+  pass
+
+
+class GatedUnitTest(test_utils.SequenceLayerTest, spec.GatedUnitTest):
+  pass
+
+
+class FlattenTest(test_utils.SequenceLayerTest, spec.FlattenTest):
+  pass
+
+
+class ReshapeTest(test_utils.SequenceLayerTest, spec.ReshapeTest):
+
+  def test_mismatch_raises(self):
+    layer = simple.Reshape.Config(output_shape=(5,)).make()
+
+    with self.assertRaises(ValueError):
+      layer.get_output_shape((12,))
+
+
+class ExpandDimsTest(test_utils.SequenceLayerTest, spec.ExpandDimsTest):
+  pass
+
+
+class SqueezeTest(test_utils.SequenceLayerTest, spec.SqueezeTest):
+  pass
+
+
+class TransposeTest(test_utils.SequenceLayerTest, spec.TransposeTest):
+
+  def test_reverse(self):
+    layer = simple.Transpose.Config().make()
+    self.assertEqual(layer.get_output_shape((2, 3, 4)), (4, 3, 2))
+
+  def test_explicit(self):
+    layer = simple.Transpose.Config(axes=(3, 2, 4)).make()
+
+    self.assertEqual(layer.get_output_shape((5, 6, 7)), (6, 5, 7))
+
+
+class OneHotTest(test_utils.SequenceLayerTest, spec.OneHotTest):
+  pass
+
+
+class EmbeddingTest(test_utils.SequenceLayerTest, spec.EmbeddingTest):
+  pass
+
+
+class DropoutTest(test_utils.SequenceLayerTest, spec.DropoutTest):
+  pass
+
+
+class Downsample1DTest(test_utils.SequenceLayerTest, spec.Downsample1DTest):
+  pass
+
+
+class Upsample1DTest(test_utils.SequenceLayerTest, spec.Upsample1DTest):
+  pass
+
+
+# class BackendDispatchTest(parameterized.TestCase):
+#   """Test config.make(backend='mlx') for simple layers."""
+#
+#   def test_identity(self):
+#     import sequence_layers.mlx  # Register backends.
+#     from sequence_layers.jax import simple as jax_simple
+#
+#     config = jax_simple.Identity.Config()
+#     mlx_layer = config.make(backend='mlx')
+#     self.assertIsInstance(mlx_layer, simple.Identity)
+#
+#   def test_relu(self):
+#     import sequence_layers.mlx
+#     from sequence_layers.jax import simple as jax_simple
+#
+#     config = jax_simple.Relu.Config()
+#     mlx_layer = config.make(backend='mlx')
+#     self.assertIsInstance(mlx_layer, simple.Relu)
+#
+#   def test_tanh(self):
+#     import sequence_layers.mlx
+#     from sequence_layers.jax import simple as jax_simple
+#
+#     config = jax_simple.Tanh.Config()
+#     mlx_layer = config.make(backend='mlx')
+#     self.assertIsInstance(mlx_layer, simple.Tanh)
+#
+#   def test_gated_linear_unit(self):
+#     import sequence_layers.mlx
+#     from sequence_layers.jax import simple as jax_simple
+#
+#     config = jax_simple.GatedLinearUnit.Config()
+#     mlx_layer = config.make(backend='mlx')
+#     self.assertIsInstance(mlx_layer, simple.GatedLinearUnit)
+#
+#   def test_reshape(self):
+#     import sequence_layers.mlx
+#     from sequence_layers.jax import simple as jax_simple
+#
+#     config = jax_simple.Reshape.Config(output_shape=(2, 3))
+#     mlx_layer = config.make(backend='mlx')
+#     self.assertIsInstance(mlx_layer, simple.Reshape)
+#
+#   def test_downsample(self):
+#     import sequence_layers.mlx
+#     from sequence_layers.jax import simple as jax_simple
+#
+#     config = jax_simple.Downsample1D.Config(rate=2)
+#     mlx_layer = config.make(backend='mlx')
+#     self.assertIsInstance(mlx_layer, simple.Downsample1D)
+
+
+class CheckpointNameTest(test_utils.SequenceLayerTest, spec.CheckpointNameTest):
+
+  def test_layer(self):
+    layer = simple.CheckpointName.Config(checkpoint_name='test').make()
+
+    x = self.random_sequence(2, 3, 4)
+    self.verify_contract(layer, x)
+
+  def test_passthrough(self):
+    layer = simple.CheckpointName.Config(checkpoint_name='test').make()
+
+    x = self.random_sequence(1, 3, 4)
+    y = layer.layer(x, training=False)
+    np.testing.assert_array_equal(y.values, x.values)
+    np.testing.assert_array_equal(y.mask, x.mask)
+
+  #   def test_from_config(self):
+
+
+#     import sequence_layers.mlx
+#     from sequence_layers.jax import simple as jax_simple
+#
+#     config = jax_simple.CheckpointName.Config(checkpoint_name='test')
+#     mlx_layer = config.make(backend='mlx')
+#     self.assertIsInstance(mlx_layer, simple.CheckpointName)
+
+
+class LambdaTest(test_utils.SequenceLayerTest, spec.LambdaTest):
+  """Test behavior of Lambda layer."""
+
+
+class LoggingTest(test_utils.SequenceLayerTest, spec.LoggingTest):
+  """Test behavior of Logging layer."""
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/typing.py b/sequence_layers/mlx/typing.py
new file mode 100644
index 0000000..32aaa4d
--- /dev/null
+++ b/sequence_layers/mlx/typing.py
@@ -0,0 +1,43 @@
+"""Lightweight typing utilities for MLX sequence layers.
+
+Provides type annotation helpers compatible with the jaxtyping-style API
+used in the JAX version, but without JAX dependencies. Since runtime type
+checking is disabled, these are purely for documentation and IDE support.
+"""
+
+from typing import Any, Callable, TypeVar
+
+import mlx.core as mx
+import numpy as np
+
+try:
+  from jaxtyping import Float, Int, Shaped, PyTree
+except ImportError:
+  # Fallback: define no-op type aliases if jaxtyping is not available.
+  Float = Any
+  Int = Any
+  Shaped = Any
+  PyTree = Any
+
+
+class _MetaArrayT(type):
+  types = ()
+
+  def __instancecheck__(cls, obj):
+    return isinstance(obj, cls.types)
+
+
+class ArrayT(metaclass=_MetaArrayT):
+  types = (mx.array, np.ndarray)
+
+
+ScalarInt = Any
+ScalarFloat = Any
+AnyPyTree = Any
+
+_F = TypeVar('_F', bound=Callable)
+
+
+def typed(function: _F) -> _F:
+  """No-op decorator for type-checked functions (runtime checking disabled)."""
+  return function
diff --git a/sequence_layers/mlx/utils.py b/sequence_layers/mlx/utils.py
index c5e3693..812b0ae 100644
--- a/sequence_layers/mlx/utils.py
+++ b/sequence_layers/mlx/utils.py
@@ -7,8 +7,12 @@
 from mlx import nn
 import mlx.core as mx
 import numpy as np
+
+from sequence_layers.specs import combinators as spec_combinators
 from sequence_layers.specs import types as specs_types
 
+CombinationMode = spec_combinators.CombinationMode
+
 
 def get_output_latency(config, accumulated_output_latency=0):
   """Returns the output latency of the provided SequenceLayerConfig.
@@ -32,13 +36,6 @@ def _get_accumulated_output_latency(layer, output_latency):
   """Computes accumulated output latency for a layer.
 
   Mirrors SequenceLayer.get_accumulated_output_latency from JAX types.
-
-  Args:
-    layer: The layer to compute latency for.
-    output_latency: The accumulated output latency of preceding layers.
-
-  Returns:
-    The accumulated output latency.
   """
   # Check for Serial-like combinators that chain layers.
   if hasattr(layer, 'layers') and isinstance(layer.layers, (list, tuple)):
@@ -84,57 +81,6 @@ def get_required_stepwise_delay(output_ratio, input_latency):
   return int(-input_latency % (1 / output_ratio))
 
 
-def call_layer_with_emits(
-    layer, x, *, training=False, constants=None, **kwargs
-):
-  """Calls layer_with_emits safely, handling signature mismatches in non-abstractified layers."""
-
-  sig = inspect.signature(layer.layer_with_emits)
-  call_kwargs = {}
-  if 'training' in sig.parameters:
-    call_kwargs['training'] = training
-  if 'constants' in sig.parameters:
-    call_kwargs['constants'] = constants
-  for k, v in kwargs.items():
-    if k in sig.parameters:
-      call_kwargs[k] = v
-  return layer.layer_with_emits(x, **call_kwargs)
-
-
-def call_step_with_emits(
-    layer, x, state, *, training=False, constants=None, **kwargs
-):
-  """Calls step_with_emits safely, handling signature mismatches in non-abstractified layers."""
-
-  sig = inspect.signature(layer.step_with_emits)
-  call_kwargs = {}
-  if 'training' in sig.parameters:
-    call_kwargs['training'] = training
-  if 'constants' in sig.parameters:
-    call_kwargs['constants'] = constants
-  for k, v in kwargs.items():
-    if k in sig.parameters:
-      call_kwargs[k] = v
-  return layer.step_with_emits(x, state, **call_kwargs)
-
-
-def call_get_initial_state(
-    layer, batch_size, input_spec, *, training=False, constants=None, **kwargs
-):
-  """Calls get_initial_state safely, handling signature mismatches in non-abstractified layers."""
-
-  sig = inspect.signature(layer.get_initial_state)
-  call_kwargs = {}
-  if 'training' in sig.parameters:
-    call_kwargs['training'] = training
-  if 'constants' in sig.parameters:
-    call_kwargs['constants'] = constants
-  for k, v in kwargs.items():
-    if k in sig.parameters:
-      call_kwargs[k] = v
-  return layer.get_initial_state(batch_size, input_spec, **call_kwargs)
-
-
 def _to_mx_dtype(dtype: Any) -> Any:
   """Converts various dtype representations to MLX DType."""
   if dtype is None:
@@ -203,8 +149,7 @@ def make_layer(config, backend='mlx') -> Any:
       layer = config.make(backend=backend)
       if layer is not None:
         return layer
-    # If it's an MLX-specific config, it might have no-arg make() returning
-    # MLX layer.
+    # If it's an MLX-specific config, it might have no-arg make() returning MLX layer.
     config_module = config.__class__.__module__
     if 'mlx' in config_module:
       layer = config.make()
@@ -221,7 +166,7 @@ def make_layer(config, backend='mlx') -> Any:
     if class_name.endswith('Config'):
       class_name = class_name[:-6]
 
-  import sequence_layers.mlx as mlx_module  # pylint: disable=import-outside-toplevel,g-import-not-at-top
+  import sequence_layers.mlx as mlx_module  # pylint: disable=import-outside-toplevel
 
   if not hasattr(mlx_module, class_name):
     raise AttributeError(
@@ -277,7 +222,7 @@ def make_layer(config, backend='mlx') -> Any:
 
     try:
       mlx_config = mlx_config_class(**kwargs)
-      return mlx_config.make()
+      return mlx_class(mlx_config)
     except Exception as e:  # pylint: disable=broad-exception-caught
       raise AttributeError(
           f"Concrete MLX class '{class_name}' does not implement from_config "
@@ -291,3 +236,112 @@ def make_layer(config, backend='mlx') -> Any:
 
 
 # pylint: enable=too-many-nested-blocks
+
+
+def call_layer_with_emits(
+    layer, x, *, training=False, constants=None, **kwargs
+):
+  """Calls layer_with_emits safely, handling signature mismatches in non-abstractified layers."""
+  sig = inspect.signature(layer.layer_with_emits)
+  call_kwargs = {}
+  if 'training' in sig.parameters:
+    call_kwargs['training'] = training
+  if 'constants' in sig.parameters:
+    call_kwargs['constants'] = constants
+  for k, v in kwargs.items():
+    if k in sig.parameters:
+      call_kwargs[k] = v
+  return layer.layer_with_emits(x, **call_kwargs)
+
+
+def call_step_with_emits(
+    layer, x, state, *, training=False, constants=None, **kwargs
+):
+  """Calls step_with_emits safely, handling signature mismatches in non-abstractified layers."""
+  sig = inspect.signature(layer.step_with_emits)
+  call_kwargs = {}
+  if 'training' in sig.parameters:
+    call_kwargs['training'] = training
+  if 'constants' in sig.parameters:
+    call_kwargs['constants'] = constants
+  for k, v in kwargs.items():
+    if k in sig.parameters:
+      call_kwargs[k] = v
+  return layer.step_with_emits(x, state, **call_kwargs)
+
+
+def call_get_initial_state(
+    layer, batch_size, input_spec, *, training=False, constants=None, **kwargs
+):
+  """Calls get_initial_state safely, handling signature mismatches in non-abstractified layers."""
+  sig = inspect.signature(layer.get_initial_state)
+  call_kwargs = {}
+  if 'training' in sig.parameters:
+    call_kwargs['training'] = training
+  if 'constants' in sig.parameters:
+    call_kwargs['constants'] = constants
+  for k, v in kwargs.items():
+    if k in sig.parameters:
+      call_kwargs[k] = v
+  return layer.get_initial_state(batch_size, input_spec, **call_kwargs)
+
+
+def _patch_spec_configs():
+  # pylint: disable=import-outside-toplevel,missing-function-docstring,reimported
+  def _make(self):
+    return make_layer(self)
+
+  # Patch the base class
+  specs_types.SequenceLayerConfig.make = _make
+
+  # Patch all spec modules dynamically
+  modules: list[Any] = []
+  try:
+    from sequence_layers.specs import combinators as spec_comb
+
+    modules.append(spec_comb)
+  except ImportError:
+    pass
+  try:
+    from sequence_layers.specs import convolution as spec_conv
+
+    modules.append(spec_conv)
+  except ImportError:
+    pass
+  try:
+    from sequence_layers.specs import dense as spec_dense
+
+    modules.append(spec_dense)
+  except ImportError:
+    pass
+  try:
+    from sequence_layers.specs import normalization as spec_norm
+
+    modules.append(spec_norm)
+  except ImportError:
+    pass
+  try:
+    from sequence_layers.specs import pooling as spec_pool
+
+    modules.append(spec_pool)
+  except ImportError:
+    pass
+  try:
+    from sequence_layers.specs import simple as spec_simple
+
+    modules.append(spec_simple)
+  except ImportError:
+    pass
+
+  for mod in modules:
+    for name in dir(mod):
+      attr = getattr(mod, name)
+      if isinstance(attr, type) and hasattr(attr, 'Config'):
+        config_cls = getattr(attr, 'Config')
+        if isinstance(config_cls, type) and issubclass(
+            config_cls, specs_types.SequenceLayerConfig
+        ):
+          config_cls.make = _make
+
+
+_patch_spec_configs()
diff --git a/sequence_layers/specs/attention.py b/sequence_layers/specs/attention.py
new file mode 100644
index 0000000..c4c7acb
--- /dev/null
+++ b/sequence_layers/specs/attention.py
@@ -0,0 +1,285 @@
+"""Specifications for attention layers.
+
+See the corresponding _behaviors module for behaviors.
+"""
+
+import abc
+import dataclasses
+from typing import Any, override, Protocol, runtime_checkable, Type
+
+from sequence_layers.specs import types as types_spec
+
+# =============================================================================
+# Projection Config Specifications
+# =============================================================================
+
+
+@dataclasses.dataclass(frozen=True)
+class QueryKeyValueProjectionConfig:
+  """Base class for QKV projection configuration."""
+
+
+@dataclasses.dataclass(frozen=True)
+class CombinedQueryKeyValueProjection(QueryKeyValueProjectionConfig):
+  """Use a single projection matrix for query/key/value projection.
+
+  * Incompatible with Grouped Query Attention (num_query_heads != num_kv_heads).
+  * Supports shared key and value projection.
+  """
+
+  # If true, share the key and value projection matrices.
+  share_kv_projection: bool = False
+
+
+@dataclasses.dataclass(frozen=True)
+class SeparateQueryKeyValueProjection(QueryKeyValueProjectionConfig):
+  """Use separate projection matrices for query/key/value projection.
+
+  * Supports Grouped Query Attention (num_query_heads != num_kv_heads).
+  * Does not support shared key and value projection. Use
+    QueryAndSharedKeyValueProjection.
+  """
+
+
+@dataclasses.dataclass(frozen=True)
+class QueryAndKeyValueProjection(QueryKeyValueProjectionConfig):
+  """Use separate query and key/value projection matrices.
+
+  * Supports Grouped Query Attention (num_query_heads != num_kv_heads).
+  * Does not support shared key and value projection. Use
+    QueryAndSharedKeyValueProjection.
+  """
+
+
+@dataclasses.dataclass(frozen=True)
+class QueryAndSharedKeyValueProjection(QueryKeyValueProjectionConfig):
+  """Use separate query and shared key/value projection matrices.
+
+  * Supports Grouped Query Attention (num_query_heads != num_kv_heads).
+  * Requires shared key and value projection.
+  """
+
+
+# =============================================================================
+# Attention Layer Specifications
+# =============================================================================
+
+
+class DotProductSelfAttention[
+    SequenceT: types_spec.Sequence,
+    ShapeDTypeT: types_spec.ChannelSpec,
+](
+    types_spec.Emitting[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for DotProductSelfAttention layer.
+
+  Multi-headed dot-product self-attention with causal masking and KV caching.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for DotProductSelfAttention layer."""
+
+    num_heads: int
+    units_per_head: int
+    max_past_horizon: int
+    max_future_horizon: int = 0
+    num_kv_heads: int | None = None
+    attention_probabilities_dropout_rate: float = 0.0
+    broadcast_dropout_across_queries: bool = False
+    use_bias: bool = False
+    input_projection: QueryKeyValueProjectionConfig = dataclasses.field(
+        default_factory=CombinedQueryKeyValueProjection
+    )
+    query_network: types_spec.SequenceLayerConfig | None = None
+    key_network: types_spec.SequenceLayerConfig | None = None
+    value_network: types_spec.SequenceLayerConfig | None = None
+    attention_logits_soft_cap: float | None = None
+    per_dim_scale: bool = False
+    query_scale: float | None = None
+    zero_fully_masked: bool = False
+    compute_dtype: types_spec.DType | None = None
+    param_dtype: types_spec.DType | None = None
+    num_sink_embeddings: int = 0
+    use_sink_scalars: bool = False
+    use_kv_cache_ringbuffer: bool = False
+    emit_attention_weights: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class LocalDotProductSelfAttention[
+    SequenceT: types_spec.Sequence,
+    ShapeDTypeT: types_spec.ChannelSpec,
+](DotProductSelfAttention[SequenceT, ShapeDTypeT], metaclass=abc.ABCMeta):
+  """Specification for LocalDotProductSelfAttention layer.
+
+  Local/block-based self-attention.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(DotProductSelfAttention.Config):
+    """Configuration for LocalDotProductSelfAttention layer."""
+
+    block_size: int = 1
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class DotProductAttention[
+    SequenceT: types_spec.Sequence,
+    ShapeDTypeT: types_spec.ChannelSpec,
+](
+    types_spec.Emitting[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for DotProductAttention layer.
+
+  Multi-headed cross-attention attending to an external source.
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for DotProductAttention layer."""
+
+    source_name: str
+    num_heads: int
+    units_per_head: int
+    attention_probabilities_dropout_rate: float = 0.0
+    broadcast_dropout_across_queries: bool = False
+    use_bias: bool = False
+    input_projection: QueryKeyValueProjectionConfig = dataclasses.field(
+        default_factory=QueryAndKeyValueProjection
+    )
+    query_network: types_spec.SequenceLayerConfig | None = None
+    key_network: types_spec.SequenceLayerConfig | None = None
+    value_network: types_spec.SequenceLayerConfig | None = None
+    attention_logits_soft_cap: float | None = None
+    per_dim_scale: bool = False
+    query_scale: float | None = None
+    zero_fully_masked: bool = False
+    compute_dtype: types_spec.DType | None = None
+    param_dtype: types_spec.DType | None = None
+    emit_attention_weights: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class StreamingDotProductAttention[
+    SequenceT: types_spec.Sequence,
+    ShapeDTypeT: types_spec.ChannelSpec,
+](
+    types_spec.Emitting[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for StreamingDotProductAttention layer.
+
+  Streaming cross-attention with rolling KV buffer. Also covers
+  StreamingLocalDotProductAttention (which differs only in layer-mode
+  efficiency via block_size, not in step-mode behavior or output).
+  """
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for StreamingDotProductAttention layer."""
+
+    source_name: str
+    num_heads: int
+    units_per_head: int
+    max_past_horizon: int
+    max_future_horizon: int = 0
+    block_size: int = 1
+    attention_probabilities_dropout_rate: float = 0.0
+    broadcast_dropout_across_queries: bool = False
+    use_bias: bool = False
+    use_query_delay_buffer: bool = True
+    input_projection: QueryKeyValueProjectionConfig = dataclasses.field(
+        default_factory=QueryAndKeyValueProjection
+    )
+    query_network: types_spec.SequenceLayerConfig | None = None
+    key_network: types_spec.SequenceLayerConfig | None = None
+    value_network: types_spec.SequenceLayerConfig | None = None
+    attention_logits_soft_cap: float | None = None
+    per_dim_scale: bool = False
+    query_scale: float | None = None
+    zero_fully_masked: bool = False
+    compute_dtype: types_spec.DType | None = None
+    param_dtype: types_spec.DType | None = None
+    num_sink_embeddings: int = 0
+    use_sink_scalars: bool = False
+    use_kv_cache_ringbuffer: bool = False
+    emit_attention_weights: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+# =============================================================================
+# ModuleSpec Protocol
+# =============================================================================
+
+
+# pylint: disable=invalid-name
+# pylint: disable=missing-function-docstring
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Protocol for the attention submodule of a backend."""
+
+  @property
+  def DotProductSelfAttention(self) -> Type[DotProductSelfAttention]:
+    ...
+
+  @property
+  def DotProductAttention(self) -> Type[DotProductAttention]:
+    ...
+
+  @property
+  def StreamingDotProductAttention(
+      self,
+  ) -> Type[StreamingDotProductAttention]:
+    ...
+
+  @property
+  def StreamingLocalDotProductAttention(
+      self,
+  ) -> Type[Any]:
+    ...
+
+  @property
+  def LocalDotProductSelfAttention(
+      self,
+  ) -> Type[LocalDotProductSelfAttention]:
+    ...
+
+  @property
+  def CombinedQueryKeyValueProjection(
+      self,
+  ) -> type[CombinedQueryKeyValueProjection]:
+    ...
+
+  @property
+  def SeparateQueryKeyValueProjection(
+      self,
+  ) -> type[SeparateQueryKeyValueProjection]:
+    ...
+
+  @property
+  def QueryAndKeyValueProjection(self) -> type[QueryAndKeyValueProjection]:
+    ...
+
+  @property
+  def QueryAndSharedKeyValueProjection(
+      self,
+  ) -> type[QueryAndSharedKeyValueProjection]:
+    ...
diff --git a/sequence_layers/specs/attention_behaviors.py b/sequence_layers/specs/attention_behaviors.py
new file mode 100644
index 0000000..22bda20
--- /dev/null
+++ b/sequence_layers/specs/attention_behaviors.py
@@ -0,0 +1,775 @@
+"""Behavior tests for attention layers.
+
+Backend-specific test files should inherit from these tests.
+"""
+
+# pylint: disable=abstract-method
+
+from absl.testing import parameterized
+import numpy as np
+
+from sequence_layers.specs import test_utils
+
+
+class DotProductSelfAttentionTest(test_utils.SequenceLayerTest):
+  """Test behavior of DotProductSelfAttention layer."""
+
+  def test_layer(self):
+    layer = self.sl.DotProductSelfAttention.Config(
+        num_heads=4,
+        units_per_head=8,
+        max_past_horizon=32,
+        name='dot_product_self_attention',
+    ).make()
+    x = self.random_sequence(2, 5, 16)
+    layer = self.init_layer(layer, x)
+    self.verify_contract(layer, x, atol=1e-4, rtol=1e-4)
+
+  def test_causal(self):
+    layer = self.sl.DotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=64,
+        max_future_horizon=0,
+    ).make()
+    x = self.random_sequence(2, 5, 8)
+    layer = self.init_layer(layer, x)
+    self.verify_contract(layer, x, atol=1e-4, rtol=1e-4)
+
+  def test_gqa(self):
+    """Test Grouped Query Attention (fewer KV heads)."""
+    layer = self.sl.DotProductSelfAttention.Config(
+        num_heads=8,
+        units_per_head=4,
+        max_past_horizon=32,
+        num_kv_heads=2,
+        input_projection=self.sl.SeparateQueryKeyValueProjection(),
+    ).make()
+    x = self.random_sequence(2, 5, 16)
+    layer = self.init_layer(layer, x)
+    self.verify_contract(layer, x, atol=1e-4, rtol=1e-4)
+
+  def test_output_shape(self):
+    layer = self.sl.DotProductSelfAttention.Config(
+        num_heads=4,
+        units_per_head=8,
+        max_past_horizon=32,
+    ).make()
+    x = self.random_sequence(2, 5, 16)
+    layer = self.init_layer(layer, x)
+    self.assertEqual(layer.get_output_shape((16,)), (4, 8))
+
+  def test_step_builds_kv_cache(self):
+    # We cannot easily test this generically if `state` structures differ,
+    # but we can check that `state` is returned and updated.
+    # Let's write a generic version or keep it backend-specific if it relies
+    # too much on backend-specific types.
+    # Actually, the state structure in JAX is:
+    # (keys, values, state_index, ...) or similar.
+    # In MLX it's:
+    # (keys, values, step)
+    # But wait, both have a step/index in state.
+    # Let's make this backend-specific since the state layout is too divergent
+    # (e.g. JAX uses flax.FrozenDict/tuple, MLX uses list/tuple).
+    pass
+
+  def test_per_dim_scale(self):
+    # Also relies on checking internal attributes like `layer._per_dim_scale`
+    # which are named differently or don't exist in the same way.
+    # In JAX it is a parameter. In MLX it is a parameter.
+    # Let's keep per_dim_scale testing backend-specific for now, or at least the
+    # parameter checking part.
+    pass
+
+  def test_per_dim_scale_step(self):
+    layer = self.sl.DotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=10,
+        per_dim_scale=True,
+    ).make()
+    x = self.random_sequence(2, 5, 8)
+    layer = self.init_layer(layer, x)
+    self.verify_contract(layer, x, atol=1e-4, rtol=1e-4)
+
+  @parameterized.parameters(
+      # max_past_horizon > 0, max_future_horizon == 0. Steppable.
+      (1, 2, 3, 0, False),
+      (1, 2, 3, 0, True),
+      (3, 5, 3, 0, False),
+      (3, 5, 3, 0, True),
+      # max_past_horizon > 0, max_future_horizon > 0. Steppable.
+      (3, 5, 3, 2, False),
+      (3, 5, 3, 2, True),
+      (3, 5, 3, 5, False),
+      (3, 5, 3, 5, True),
+  )
+  def test_use_kv_cache_ringbuffer(
+      self,
+      num_heads: int,
+      units_per_head: int,
+      max_past_horizon: int,
+      max_future_horizon: int,
+      random_mask: bool,
+  ):
+    """Test ring buffer wrap-around: layer() vs step() parity."""
+    batch_size = 2
+    layer = self.sl.DotProductSelfAttention.Config(
+        num_heads=num_heads,
+        units_per_head=units_per_head,
+        max_past_horizon=max_past_horizon,
+        max_future_horizon=max_future_horizon,
+        per_dim_scale=True,
+        use_kv_cache_ringbuffer=True,
+        name='dot_product_self_attention',
+    ).make()
+
+    channels = 1
+    x_init = self.random_sequence(batch_size, 1, channels)
+    layer = self.init_layer(layer, x_init)
+
+    self.assertEqual(layer.block_size, 1)
+    self.assertEqual(layer.output_ratio, 1)
+    self.assertEqual(layer.name, 'dot_product_self_attention')
+    self.assertEqual(
+        layer.get_output_shape((channels,)), (num_heads, units_per_head)
+    )
+    self.assertTrue(layer.supports_step)
+    self.assertEqual(layer.input_latency, max(0, max_future_horizon))
+
+    for time in [1, 2, 3, 11, 12]:
+      with self.subTest(f'time_{time}'):
+        x = self.random_sequence(
+            batch_size, time, channels, random_mask=random_mask
+        )
+        self.verify_contract(
+            layer,
+            x,
+            training=False,
+            test_2x_step=False,
+            atol=1e-4,
+            rtol=1e-4,
+        )
+
+  @parameterized.product(
+      (
+          # CombinedQueryKeyValueProjection. GQA is not supported.
+          {
+              'input_projection_name': 'CombinedQueryKeyValueProjection',
+              'share_kv_projection': False,
+              'num_heads': 3,
+              'num_kv_heads': None,
+          },
+          {
+              'input_projection_name': 'CombinedQueryKeyValueProjection',
+              'share_kv_projection': True,
+              'num_heads': 3,
+              'num_kv_heads': None,
+          },
+          # SeparateQueryKeyValueProjection. MHA and GQA supported.
+          {
+              'input_projection_name': 'SeparateQueryKeyValueProjection',
+              'share_kv_projection': False,
+              'num_heads': 3,
+              'num_kv_heads': None,
+          },
+          {
+              'input_projection_name': 'SeparateQueryKeyValueProjection',
+              'share_kv_projection': False,
+              'num_heads': 6,
+              'num_kv_heads': 3,
+          },
+          # QueryAndKeyValueProjection. MHA and GQA supported.
+          {
+              'input_projection_name': 'QueryAndKeyValueProjection',
+              'share_kv_projection': False,
+              'num_heads': 3,
+              'num_kv_heads': None,
+          },
+          {
+              'input_projection_name': 'QueryAndKeyValueProjection',
+              'share_kv_projection': False,
+              'num_heads': 6,
+              'num_kv_heads': 3,
+          },
+          # QueryAndSharedKeyValueProjection. MHA and GQA supported.
+          {
+              'input_projection_name': 'QueryAndSharedKeyValueProjection',
+              'share_kv_projection': False,
+              'num_heads': 3,
+              'num_kv_heads': None,
+          },
+          {
+              'input_projection_name': 'QueryAndSharedKeyValueProjection',
+              'share_kv_projection': False,
+              'num_heads': 6,
+              'num_kv_heads': 3,
+          },
+      ),
+  )
+  def test_projection_config_contract(
+      self,
+      input_projection_name: str,
+      share_kv_projection: bool,
+      num_heads: int,
+      num_kv_heads: int | None,
+  ):
+    proj_cls = getattr(self.sl.attention, input_projection_name)
+    if input_projection_name == 'CombinedQueryKeyValueProjection':
+      input_projection = proj_cls(share_kv_projection=share_kv_projection)
+    else:
+      input_projection = proj_cls()
+
+    batch_size, units_per_head = 2, 5
+    max_past_horizon = 7
+    max_future_horizon = 11
+
+    l = self.sl.DotProductSelfAttention.Config(
+        num_heads=num_heads,
+        num_kv_heads=num_kv_heads,
+        units_per_head=units_per_head,
+        input_projection=input_projection,
+        max_past_horizon=max_past_horizon,
+        max_future_horizon=max_future_horizon,
+        name='dot_product_self_attention',
+    ).make()
+
+    x = self.random_sequence(batch_size, 16, 2)
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'dot_product_self_attention')
+    self.assertEqual(l.get_output_shape((2,)), (num_heads, units_per_head))
+    self.assertTrue(l.supports_step)
+    self.assertEqual(l.input_latency, max(0, max_future_horizon))
+
+    self.verify_contract(
+        l,
+        x,
+        training=False,
+        grad_atol=1e-5,
+        grad_rtol=1e-5,
+    )
+
+  def test_attention_emits(self):
+    layer = self.sl.DotProductSelfAttention.Config(
+        num_heads=2,
+        units_per_head=4,
+        max_past_horizon=10,
+        emit_attention_weights=True,
+        name='self_attn_emits',
+    ).make()
+    x = self.random_sequence(2, 5, 8)
+    layer = self.init_layer(layer, x)
+
+    y_emits, emits = layer.layer_with_emits(x, training=False)
+    self.assertIsNotNone(emits)
+    self.assertTrue(hasattr(emits, 'probabilities'))
+
+    probs = emits.probabilities
+    self.assertEqual(probs.shape, (2, 5, 2, 5))
+
+    sum_probs = np.sum(np.asarray(probs.values), axis=-1)
+    np.testing.assert_allclose(sum_probs, 1.0, atol=1e-5)
+
+    y_standard = layer.layer(x, training=False)
+    np.testing.assert_allclose(
+        np.asarray(y_emits.values), np.asarray(y_standard.values), atol=1e-5
+    )
+
+
+class DotProductAttentionTest(test_utils.SequenceLayerTest):
+  """Test behavior of DotProductAttention layer."""
+
+  def _make_constants(self, batch, time, features, name='source'):
+    """Helper to create random sequence for cross-attention constants."""
+    source = self.random_sequence(batch, time, features)
+    return {name: source}
+
+  def test_layer(self):
+    layer = self.sl.DotProductAttention.Config(
+        source_name='source',
+        num_heads=2,
+        units_per_head=4,
+        name='dot_product_attention',
+    ).make()
+    constants = self._make_constants(2, 6, 12)
+    x = self.random_sequence(2, 5, 8)
+    layer = self.init_layer(layer, x, constants=constants)
+    self.verify_contract(
+        layer,
+        x,
+        constants=constants,
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_output_shape(self):
+    layer = self.sl.DotProductAttention.Config(
+        source_name='enc',
+        num_heads=4,
+        units_per_head=8,
+    ).make()
+    x = self.random_sequence(1, 5, 16)
+    constants = self._make_constants(1, 5, 16, name='enc')
+    layer = self.init_layer(layer, x, constants=constants)
+    self.assertEqual(layer.get_output_shape((16,)), (4, 8))
+
+  def test_step_reuses_precomputed_kv(self):
+    layer = self.sl.DotProductAttention.Config(
+        source_name='source',
+        num_heads=2,
+        units_per_head=4,
+    ).make()
+    constants = self._make_constants(1, 6, 12)
+    x = self.random_sequence(1, 1, 8)
+    layer = self.init_layer(layer, x, constants=constants)
+
+    input_spec = self.sl.types.ShapeDType((8,), x.dtype)
+    state = layer.get_initial_state(
+        1, input_spec, training=False, constants=constants
+    )
+    # KV should be pre-computed.
+    keys_v = state[0]
+    self.assertEqual(keys_v.shape, (1, 6, 2, 4))
+
+    for _ in range(3):
+      x_step = self.random_sequence(1, 1, 8)
+      y, state = layer.step(x_step, state, training=False, constants=constants)
+      self.assertEqual(y.channel_shape, (2, 4))
+
+  def test_missing_source_raises(self):
+    layer = self.sl.DotProductAttention.Config(
+        source_name='missing',
+        num_heads=2,
+        units_per_head=4,
+    ).make()
+    x = self.random_sequence(1, 3, 8)
+    layer = self.init_layer(layer, x, bind_only=True)
+    with self.assertRaises(ValueError):
+      layer.layer(x, constants={}, training=False)
+
+  def test_logits_soft_cap(self):
+    num_heads, units_per_head = 3, 5
+    batch_size, source_time, source_channels = 2, 11, 2
+    source_name = 'source'
+    l = self.sl.DotProductAttention.Config(
+        source_name,
+        num_heads=num_heads,
+        units_per_head=units_per_head,
+        attention_logits_soft_cap=50.0,
+        name='dot_product_attention',
+    ).make()
+
+    source = self.random_sequence(batch_size, source_time, source_channels)
+    constants = {source_name: source}
+    time, channels = 21, 3
+    x = self.random_sequence(batch_size, time, channels)
+    l = self.init_layer(l, x, constants=constants)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'dot_product_attention')
+
+    self.assertEqual(
+        l.get_output_shape((channels,)),
+        (num_heads, units_per_head),
+    )
+    self.verify_contract(
+        l,
+        x,
+        training=False,
+        constants=constants,
+        grad_atol=1e-5,
+        grad_rtol=1e-5,
+    )
+
+  def test_attention_emits(self):
+    layer = self.sl.DotProductAttention.Config(
+        source_name='enc_source',
+        num_heads=2,
+        units_per_head=4,
+        emit_attention_weights=True,
+        name='cross_attn_emits',
+    ).make()
+    source = self.random_sequence(2, 6, 8)
+    x = self.random_sequence(2, 4, 8)
+    constants = {'enc_source': source}
+    layer = self.init_layer(layer, x, constants=constants)
+
+    y_emits, emits = layer.layer_with_emits(
+        x, constants=constants, training=False
+    )
+    self.assertIsNotNone(emits)
+    self.assertTrue(hasattr(emits, 'probabilities_by_source'))
+    self.assertIn('enc_source', emits.probabilities_by_source)
+
+    probs = emits.probabilities_by_source['enc_source']
+    self.assertEqual(probs.shape, (2, 4, 2, 6))
+
+    sum_probs = np.sum(np.asarray(probs.values), axis=-1)
+    np.testing.assert_allclose(sum_probs, 1.0, atol=1e-5)
+
+    y_standard = layer.layer(x, constants=constants, training=False)
+    np.testing.assert_allclose(
+        np.asarray(y_emits.values), np.asarray(y_standard.values), atol=1e-5
+    )
+
+
+class StreamingDotProductAttentionTest(test_utils.SequenceLayerTest):
+  """Test behavior of StreamingDotProductAttention layer."""
+
+  def test_layer_basic(self):
+    """Basic contract verification."""
+    num_heads, units_per_head = 2, 4
+    max_past_horizon = 4
+    batch_size, source_time, source_channels = 2, 8, 12
+    time, channels = 8, 8
+    source_name = 'source'
+
+    layer = self.sl.StreamingDotProductAttention.Config(
+        source_name,
+        num_heads=num_heads,
+        units_per_head=units_per_head,
+        max_past_horizon=max_past_horizon,
+        name='streaming_dot_product_attention',
+    ).make()
+
+    source = self.random_sequence(batch_size, source_time, source_channels)
+    constants = {source_name: source}
+    x = self.random_sequence(batch_size, time, channels)
+    layer = self.init_layer(layer, x, constants=constants)
+
+    self.assertEqual(layer.block_size, 1)
+    self.assertEqual(layer.output_ratio, 1)
+    self.assertEqual(layer.name, 'streaming_dot_product_attention')
+    self.assertEqual(
+        layer.get_output_shape((channels,)),
+        (num_heads, units_per_head),
+    )
+    self.verify_contract(
+        layer,
+        x,
+        training=False,
+        constants=constants,
+        stream_constants=True,
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_future_horizon(self):
+    """Delay buffer with max_future_horizon > 0."""
+    num_heads, units_per_head = 2, 4
+    max_past_horizon = 4
+    max_future_horizon = 2
+    batch_size, source_time, source_channels = 2, 8, 8
+    time, channels = 8, 8
+    source_name = 'source'
+
+    layer = self.sl.StreamingDotProductAttention.Config(
+        source_name,
+        num_heads=num_heads,
+        units_per_head=units_per_head,
+        max_past_horizon=max_past_horizon,
+        max_future_horizon=max_future_horizon,
+        name='streaming_dot_product_attention_future',
+    ).make()
+
+    source = self.random_sequence(batch_size, source_time, source_channels)
+    constants = {source_name: source}
+    x = self.random_sequence(batch_size, time, channels)
+    layer = self.init_layer(layer, x, constants=constants)
+
+    self.assertEqual(layer.input_latency, max_future_horizon)
+    self.verify_contract(
+        layer,
+        x,
+        training=False,
+        constants=constants,
+        stream_constants=True,
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_with_rope(self):
+    """Position processing networks (RoPE)."""
+    num_heads, units_per_head = 2, 4
+    max_past_horizon = 16
+    batch_size, source_time, source_channels = 2, 8, 12
+    time, channels = 8, 8
+    source_name = 'source'
+
+    rope_q = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=10000.0
+    )
+    rope_k = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=10000.0
+    )
+
+    layer = self.sl.StreamingDotProductAttention.Config(
+        source_name,
+        num_heads=num_heads,
+        units_per_head=units_per_head,
+        max_past_horizon=max_past_horizon,
+        query_network=rope_q,
+        key_network=rope_k,
+        name='streaming_dot_product_attention_rope',
+    ).make()
+
+    source = self.random_sequence(batch_size, source_time, source_channels)
+    constants = {source_name: source}
+    x = self.random_sequence(batch_size, time, channels)
+    layer = self.init_layer(layer, x, constants=constants)
+
+    self.verify_contract(
+        layer,
+        x,
+        training=False,
+        constants=constants,
+        stream_constants=True,
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_query_key_value_network_supports_step(self):
+    x = self.random_sequence(2, 1, 3)
+    source = self.random_sequence(2, 1, 5)
+    constants = {'source': source}
+
+    l = self.sl.StreamingDotProductAttention.Config(
+        source_name='source',
+        num_heads=3,
+        units_per_head=5,
+        max_past_horizon=3,
+        max_future_horizon=0,
+        query_network=self.sl.AddTimingSignal.Config(),
+        key_network=self.sl.AddTimingSignal.Config(),
+        value_network=self.sl.AddTimingSignal.Config(),
+    ).make()
+    l = self.init_layer(l, x, constants=constants)
+    self.assertTrue(l.supports_step)
+
+    l = self.sl.StreamingDotProductAttention.Config(
+        source_name='source',
+        num_heads=3,
+        units_per_head=5,
+        max_past_horizon=3,
+        max_future_horizon=0,
+        query_network=self.sl.test_utils.NonSteppableLayer.Config(),
+        key_network=self.sl.AddTimingSignal.Config(),
+        value_network=self.sl.AddTimingSignal.Config(),
+    ).make()
+    l = self.init_layer(l, x, constants=constants)
+    self.assertFalse(l.supports_step)
+
+    l = self.sl.StreamingDotProductAttention.Config(
+        source_name='source',
+        num_heads=3,
+        units_per_head=5,
+        max_past_horizon=3,
+        max_future_horizon=0,
+        query_network=self.sl.AddTimingSignal.Config(),
+        key_network=self.sl.test_utils.NonSteppableLayer.Config(),
+        value_network=self.sl.AddTimingSignal.Config(),
+    ).make()
+    l = self.init_layer(l, x, constants=constants)
+    self.assertFalse(l.supports_step)
+
+    l = self.sl.StreamingDotProductAttention.Config(
+        source_name='source',
+        num_heads=3,
+        units_per_head=5,
+        max_past_horizon=3,
+        max_future_horizon=0,
+        query_network=self.sl.AddTimingSignal.Config(),
+        key_network=self.sl.AddTimingSignal.Config(),
+        value_network=self.sl.test_utils.NonSteppableLayer.Config(),
+    ).make()
+    l = self.init_layer(l, x, constants=constants)
+    self.assertFalse(l.supports_step)
+
+
+class LocalDotProductSelfAttentionTest(test_utils.SequenceLayerTest):
+  """Test behavior of LocalDotProductSelfAttention layer."""
+
+  test_step_in_future_horizon = True
+
+  def test_layer_basic(self):
+    """Basic contract verification."""
+    num_heads, units_per_head = 4, 4
+    max_past_horizon = 8
+    block_size = 2
+    batch_size, time, channels = 2, 8, 16
+
+    layer = self.sl.LocalDotProductSelfAttention.Config(
+        num_heads=num_heads,
+        units_per_head=units_per_head,
+        max_past_horizon=max_past_horizon,
+        block_size=block_size,
+        name='local_dot_product_self_attention',
+    ).make()
+
+    x = self.random_sequence(batch_size, time, channels)
+    layer = self.init_layer(layer, x)
+
+    self.assertEqual(layer.output_ratio, 1)
+    self.assertEqual(layer.name, 'local_dot_product_self_attention')
+    self.assertEqual(
+        layer.get_output_shape((channels,)),
+        (num_heads, units_per_head),
+    )
+    self.verify_contract(
+        layer,
+        x,
+        training=False,
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_future_horizon(self):
+    """Delay buffer with max_future_horizon > 0."""
+    num_heads, units_per_head = 2, 4
+    max_past_horizon = 4
+    max_future_horizon = 2
+    block_size = 1
+    batch_size, time, channels = 2, 8, 8
+
+    layer = self.sl.LocalDotProductSelfAttention.Config(
+        num_heads=num_heads,
+        units_per_head=units_per_head,
+        max_past_horizon=max_past_horizon,
+        max_future_horizon=max_future_horizon,
+        block_size=block_size,
+        name='local_dot_product_self_attention_future',
+    ).make()
+
+    x = self.random_sequence(batch_size, time, channels)
+    layer = self.init_layer(layer, x)
+
+    self.assertEqual(layer.input_latency, max_future_horizon)
+    self.verify_contract(
+        layer,
+        x,
+        training=False,
+        test_step=self.test_step_in_future_horizon,
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_with_soft_cap(self):
+    """Soft cap on attention logits."""
+    num_heads, units_per_head = 2, 4
+    max_past_horizon = 8
+    block_size = 1
+    batch_size, time, channels = 2, 8, 8
+
+    layer = self.sl.LocalDotProductSelfAttention.Config(
+        num_heads=num_heads,
+        units_per_head=units_per_head,
+        max_past_horizon=max_past_horizon,
+        block_size=block_size,
+        attention_logits_soft_cap=50.0,
+        name='local_dot_product_self_attention_soft_cap',
+    ).make()
+
+    x = self.random_sequence(batch_size, time, channels)
+    layer = self.init_layer(layer, x)
+
+    self.verify_contract(
+        layer,
+        x,
+        training=False,
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_with_rope(self):
+    """Rotary Positional Encoding on query/key."""
+    num_heads, units_per_head = 2, 4
+    max_past_horizon = 8
+    block_size = 1
+    batch_size, time, channels = 2, 8, 8
+
+    rope_q = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=10000.0
+    )
+    rope_k = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=10000.0
+    )
+
+    layer = self.sl.LocalDotProductSelfAttention.Config(
+        num_heads=num_heads,
+        units_per_head=units_per_head,
+        max_past_horizon=max_past_horizon,
+        block_size=block_size,
+        query_network=rope_q,
+        key_network=rope_k,
+        name='local_dot_product_self_attention_rope',
+    ).make()
+
+    x = self.random_sequence(batch_size, time, channels)
+    layer = self.init_layer(layer, x)
+
+    self.verify_contract(
+        layer,
+        x,
+        training=False,
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+  def test_query_key_value_network_supports_step(self):
+    x = self.random_sequence(2, 1, 3)
+
+    l = self.sl.LocalDotProductSelfAttention.Config(
+        num_heads=3,
+        units_per_head=5,
+        max_past_horizon=3,
+        max_future_horizon=0,
+        block_size=1,
+        query_network=self.sl.AddTimingSignal.Config(),
+        key_network=self.sl.AddTimingSignal.Config(),
+        value_network=self.sl.AddTimingSignal.Config(),
+    ).make()
+    l = self.init_layer(l, x)
+    self.assertTrue(l.supports_step)
+
+    l = self.sl.LocalDotProductSelfAttention.Config(
+        num_heads=3,
+        units_per_head=5,
+        max_past_horizon=3,
+        max_future_horizon=0,
+        block_size=1,
+        query_network=self.sl.test_utils.NonSteppableLayer.Config(),
+        key_network=self.sl.AddTimingSignal.Config(),
+        value_network=self.sl.AddTimingSignal.Config(),
+    ).make()
+    l = self.init_layer(l, x)
+    self.assertFalse(l.supports_step)
+
+    l = self.sl.LocalDotProductSelfAttention.Config(
+        num_heads=3,
+        units_per_head=5,
+        max_past_horizon=3,
+        max_future_horizon=0,
+        block_size=1,
+        query_network=self.sl.AddTimingSignal.Config(),
+        key_network=self.sl.test_utils.NonSteppableLayer.Config(),
+        value_network=self.sl.AddTimingSignal.Config(),
+    ).make()
+    l = self.init_layer(l, x)
+    self.assertFalse(l.supports_step)
+
+    l = self.sl.LocalDotProductSelfAttention.Config(
+        num_heads=3,
+        units_per_head=5,
+        max_past_horizon=3,
+        max_future_horizon=0,
+        block_size=1,
+        query_network=self.sl.AddTimingSignal.Config(),
+        key_network=self.sl.AddTimingSignal.Config(),
+        value_network=self.sl.test_utils.NonSteppableLayer.Config(),
+    ).make()
+    l = self.init_layer(l, x)
+    self.assertFalse(l.supports_step)
diff --git a/sequence_layers/specs/combinators.py b/sequence_layers/specs/combinators.py
new file mode 100644
index 0000000..8cae1e2
--- /dev/null
+++ b/sequence_layers/specs/combinators.py
@@ -0,0 +1,165 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specifications for combinator layers.
+
+See the corresponding _behaviors module for behaviors.
+"""
+
+import abc
+import dataclasses
+import enum
+from typing import (Any, Callable, override, Protocol, runtime_checkable,
+                    Sequence)
+
+from sequence_layers.specs import types as types_spec
+
+
+@enum.unique
+class CombinationMode(enum.Enum):
+  """The type of combination to perform."""
+
+  STACK = 1
+  CONCAT = 2
+  ADD = 3
+  MEAN = 4
+  PRODUCT = 5
+
+
+class Serial[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Emitting[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Serial layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Serial."""
+
+    layers: Sequence[types_spec.SequenceLayerConfig] = ()
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class SerialModules[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Emitting[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for SerialModules layer."""
+
+
+class Residual[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Emitting[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Residual layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Residual."""
+
+    layers: Sequence[types_spec.SequenceLayerConfig] = ()
+    shortcut_layers: Sequence[types_spec.SequenceLayerConfig] | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class Repeat[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Emitting[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Repeat layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Repeat."""
+
+    layer: types_spec.SequenceLayerConfig
+    num_repeats: int
+    remat: bool = False
+    prevent_cse: bool = False
+    policy: Callable[..., bool] | None = None
+    unroll_layer: bool = False
+    unroll_step: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class Parallel[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Emitting[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Parallel layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Parallel."""
+
+    layers: Sequence[types_spec.SequenceLayerConfig]
+    combination: CombinationMode = CombinationMode.STACK
+    share_scope: bool | Sequence[bool] = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+# pylint: disable=invalid-name
+# pylint: disable=missing-function-docstring
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Protocol for combinators module."""
+
+  @property
+  def CombinationMode(self) -> type[CombinationMode]:
+    ...
+
+  @property
+  def Serial(self) -> type[Serial]:
+    ...
+
+  @property
+  def SerialModules(self) -> type[SerialModules]:
+    ...
+
+  @property
+  def Residual(self) -> type[Residual]:
+    ...
+
+  @property
+  def Repeat(self) -> type[Repeat]:
+    ...
+
+  @property
+  def Parallel(self) -> type[Parallel]:
+    ...
diff --git a/sequence_layers/specs/combinators_behaviors.py b/sequence_layers/specs/combinators_behaviors.py
new file mode 100644
index 0000000..9578c6e
--- /dev/null
+++ b/sequence_layers/specs/combinators_behaviors.py
@@ -0,0 +1,477 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Shared behavior tests for combinators."""
+
+# pylint: disable=abstract-method
+# pyrefly: disable=bad-instantiation
+
+import dataclasses
+import fractions
+from typing import Any, override
+
+import numpy as np
+
+from sequence_layers.specs import combinators as spec_combinators
+from sequence_layers.specs import test_utils as test_utils_spec
+from sequence_layers.specs import types as types_spec
+
+
+# pylint: disable=abstract-method
+# pyrefly: disable=bad-instantiation
+class CombinatorBehaviorsTest(test_utils_spec.SequenceLayerTest):
+  """Base test class for shared combinator tests."""
+
+  def create_dummy_layer_config(
+      self,
+      val: float = 1.0,
+      state_val: float = 0.0,
+      block_size: int = 1,
+      output_ratio: int = 1,
+      input_latency: int = 0,
+      out_features: int | None = None,
+  ) -> Any:
+    """Helper to create a dummy layer config tied to the active backend."""
+    # pylint: disable=missing-class-docstring,missing-function-docstring,unused-argument
+    backend_sl = self.sl
+    xp = self.xp
+
+    if "jax" in backend_sl.__name__:  # pyrefly: ignore[missing-attribute]
+
+      @dataclasses.dataclass
+      class DummyAddLayer(backend_sl.types.Emitting):
+        val: float = 1.0
+        state_val: float = 0.0
+        _block_size: int = 1
+        _output_ratio: fractions.Fraction = fractions.Fraction(1)
+        _input_latency: int = 0
+        out_features: int | None = None
+
+        @property
+        @override
+        def supports_step(self) -> bool:
+          return True
+
+        @property
+        @override
+        def block_size(self) -> int:
+          return self._block_size
+
+        @property
+        @override
+        def output_ratio(self) -> fractions.Fraction:
+          return self._output_ratio
+
+        @property
+        @override
+        def input_latency(self) -> int:
+          return self._input_latency
+
+        @property
+        @override
+        def receptive_field_per_step(  # pyrefly: ignore[bad-override]
+            self,
+        ) -> dict[int, Any]:
+          return {0: (0, 0)}
+
+        @override
+        def get_output_shape(
+            self, input_shape, *, constants=None
+        ) -> tuple[int, ...]:
+          if self.out_features is not None:
+            return (self.out_features,)
+          return tuple(input_shape)
+
+        @override
+        def get_output_dtype(self, input_dtype, *, constants=None):
+          return input_dtype
+
+        @override
+        def get_initial_state(
+            self,
+            batch_size,
+            input_spec,
+            *,
+            training=False,
+            constants=None,
+            **kwargs,
+        ):
+          return xp.broadcast_to(
+              xp.array(self.state_val, dtype=xp.float32), (batch_size,)
+          )
+
+        @override
+        def layer_with_emits(
+            self, x, *, training=False, constants=None, **kwargs
+        ):
+          y_values = x.values + self.val
+          if self.out_features is not None:
+            in_ch = x.values.shape[-1]
+            if self.out_features > in_ch:
+              pad_shape = list(y_values.shape)
+              pad_shape[-1] = self.out_features - in_ch
+              zeros = xp.zeros(tuple(pad_shape), dtype=y_values.dtype)
+              y_values = xp.concatenate([y_values, zeros], axis=-1)
+            else:
+              y_values = y_values[..., : self.out_features]
+          y_values = y_values * x.mask[..., None]
+          emit_val = y_values * 0 + self.val
+          return type(x)(  # pyrefly: ignore[bad-instantiation]
+              y_values, x.mask
+          ), {"emit_val": emit_val}
+
+        @override
+        def step_with_emits(
+            self, x, state, *, training=False, constants=None, **kwargs
+        ):
+          y_values = x.values + self.val
+          if self.out_features is not None:
+            in_ch = x.values.shape[-1]
+            if self.out_features > in_ch:
+              pad_shape = list(y_values.shape)
+              pad_shape[-1] = self.out_features - in_ch
+              zeros = xp.zeros(tuple(pad_shape), dtype=y_values.dtype)
+              y_values = xp.concatenate([y_values, zeros], axis=-1)
+            else:
+              y_values = y_values[..., : self.out_features]
+          y_values = y_values * x.mask[..., None]
+          emit_val = y_values * 0 + self.val
+          return (
+              type(x)(y_values, x.mask),  # pyrefly: ignore[bad-instantiation]
+              state + 1.0,
+              {"emit_val": emit_val},
+          )
+
+    else:
+
+      class DummyAddLayer(backend_sl.types.Emitting):
+
+        def __init__(
+            self,
+            val,
+            state_val,
+            _block_size,
+            _output_ratio,
+            _input_latency,
+            out_features,
+        ):
+          super().__init__()
+          self.val = val
+          self.state_val = state_val
+          self._block_size = _block_size
+          self._output_ratio = _output_ratio
+          self._input_latency = _input_latency
+          self.out_features = out_features
+
+        @property
+        @override
+        def supports_step(self) -> bool:
+          return True
+
+        @property
+        @override
+        def block_size(self) -> int:
+          return self._block_size
+
+        @property
+        @override
+        def output_ratio(self) -> fractions.Fraction:
+          return self._output_ratio
+
+        @property
+        @override
+        def input_latency(self) -> int:
+          return self._input_latency
+
+        @property
+        @override
+        def receptive_field_per_step(  # pyrefly: ignore[bad-override]
+            self,
+        ) -> dict[int, Any]:
+          return {0: (0, 0)}
+
+        @override
+        def get_output_shape(
+            self, input_shape, *, constants=None
+        ) -> tuple[int, ...]:
+          if self.out_features is not None:
+            return (self.out_features,)
+          return tuple(input_shape)
+
+        @override
+        def get_output_dtype(self, input_dtype, *, constants=None):
+          return input_dtype
+
+        @override
+        def get_initial_state(
+            self,
+            batch_size,
+            input_spec,
+            *,
+            training=False,
+            constants=None,
+            **kwargs,
+        ):
+          return xp.broadcast_to(
+              xp.array(self.state_val, dtype=xp.float32), (batch_size,)
+          )
+
+        @override
+        def layer_with_emits(
+            self, x, *, training=False, constants=None, **kwargs
+        ):
+          y_values = x.values + self.val
+          if self.out_features is not None:
+            in_ch = x.values.shape[-1]
+            if self.out_features > in_ch:
+              pad_shape = list(y_values.shape)
+              pad_shape[-1] = self.out_features - in_ch
+              zeros = xp.zeros(tuple(pad_shape), dtype=y_values.dtype)
+              y_values = xp.concatenate([y_values, zeros], axis=-1)
+            else:
+              y_values = y_values[..., : self.out_features]
+          y_values = y_values * x.mask[..., None]
+          emit_val = y_values * 0 + self.val
+          return type(x)(  # pyrefly: ignore[bad-instantiation]
+              y_values, x.mask
+          ), {"emit_val": emit_val}
+
+        @override
+        def step_with_emits(
+            self, x, state, *, training=False, constants=None, **kwargs
+        ):
+          y_values = x.values + self.val
+          if self.out_features is not None:
+            in_ch = x.values.shape[-1]
+            if self.out_features > in_ch:
+              pad_shape = list(y_values.shape)
+              pad_shape[-1] = self.out_features - in_ch
+              zeros = xp.zeros(tuple(pad_shape), dtype=y_values.dtype)
+              y_values = xp.concatenate([y_values, zeros], axis=-1)
+            else:
+              y_values = y_values[..., : self.out_features]
+          y_values = y_values * x.mask[..., None]
+          emit_val = y_values * 0 + self.val
+          return (
+              type(x)(y_values, x.mask),  # pyrefly: ignore[bad-instantiation]
+              state + 1.0,
+              {"emit_val": emit_val},
+          )
+
+    @dataclasses.dataclass(frozen=True)
+    class DummyConfig(types_spec.SequenceLayerConfig):
+      val: float
+      state_val: float
+      block_size: int
+      output_ratio: int
+      input_latency: int
+      out_features: int | None
+
+      @override
+      def make(self, backend="jax"):
+        return DummyAddLayer(  # pyrefly: ignore[bad-instantiation]
+            val=self.val,
+            state_val=self.state_val,
+            _block_size=self.block_size,
+            _output_ratio=fractions.Fraction(self.output_ratio),
+            _input_latency=self.input_latency,
+            out_features=self.out_features,
+        )
+
+    return DummyConfig(
+        val=val,
+        state_val=state_val,
+        block_size=block_size,
+        output_ratio=output_ratio,
+        input_latency=input_latency,
+        out_features=out_features,
+    )
+
+  def test_serial_basic(self):
+    config = self.sl.combinators.Serial.Config([
+        self.create_dummy_layer_config(val=1.0),
+        self.create_dummy_layer_config(val=2.0),
+    ])
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 5, 3)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    # Serial adds: x + 1.0 + 2.0 = x + 3.0
+    expected = (x.values + 3.0) * x.mask[..., None]
+    np.testing.assert_allclose(y.values, expected, atol=1e-6)
+
+    self.verify_contract(layer, x)
+
+  def test_serial_empty(self):
+    config = self.sl.combinators.Serial.Config([])
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 5, 3)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+    np.testing.assert_allclose(y.values, x.values, atol=1e-6)
+    self.verify_contract(layer, x)
+
+  def test_serial_state_tracking(self):
+    config = self.sl.combinators.Serial.Config([
+        self.create_dummy_layer_config(val=1.0, state_val=10.0),
+        self.create_dummy_layer_config(val=2.0, state_val=20.0),
+    ])
+    layer = self.make_layer(config)
+    x = self.random_sequence(1, 1, 2)
+    layer = self.init_layer(layer, x)
+    state = layer.get_initial_state(1, x.channel_spec, training=False)
+    self.assertEqual(state, (10.0, 20.0))
+
+    y, next_state = layer.step(x, state, training=False)
+    # States should increment
+    self.assertEqual(next_state, (11.0, 21.0))
+    expected = (x.values + 3.0) * x.mask[..., None]
+    np.testing.assert_allclose(y.values, expected, atol=1e-6)
+
+  def test_residual_basic(self):
+    # y = body(x) + shortcut(x)
+    # body = add 2.0, shortcut = identity
+    config = self.sl.combinators.Residual.Config(
+        [self.create_dummy_layer_config(val=2.0)]
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 4, 3)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    # expected = (x + 2.0) + x = 2x + 2.0
+    expected = (x.values * 2.0 + 2.0) * x.mask[..., None]
+    np.testing.assert_allclose(y.values, expected, atol=1e-6)
+
+    self.verify_contract(layer, x)
+
+  def test_residual_with_custom_shortcut(self):
+    # body = add 2.0, shortcut = add 5.0
+    config = self.sl.combinators.Residual.Config(
+        [self.create_dummy_layer_config(val=2.0)],
+        shortcut_layers=[self.create_dummy_layer_config(val=5.0)],
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 4, 3)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    # expected = (x + 2.0) + (x + 5.0) = 2x + 7.0
+    expected = (x.values * 2.0 + 7.0) * x.mask[..., None]
+    np.testing.assert_allclose(y.values, expected, atol=1e-6)
+
+    self.verify_contract(layer, x)
+
+  def test_repeat_basic(self):
+    # Repeats a layer N times
+    config = self.sl.combinators.Repeat.Config(
+        layer=self.create_dummy_layer_config(val=1.5),
+        num_repeats=4,
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 5, 3)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    # expected = x + 4 * 1.5 = x + 6.0
+    expected = (x.values + 6.0) * x.mask[..., None]
+    np.testing.assert_allclose(y.values, expected, atol=1e-6)
+
+    self.verify_contract(layer, x)
+
+  def test_parallel_stack(self):
+    config = self.sl.combinators.Parallel.Config(
+        [
+            self.create_dummy_layer_config(val=1.0),
+            self.create_dummy_layer_config(val=2.0),
+        ],
+        combination=spec_combinators.CombinationMode.STACK,
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 3, 4)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    self.assertEqual(y.channel_shape, (2, 4))
+    self.verify_contract(layer, x)
+
+  def test_parallel_concat(self):
+    config = self.sl.combinators.Parallel.Config(
+        [
+            self.create_dummy_layer_config(val=1.0, out_features=3),
+            self.create_dummy_layer_config(val=2.0, out_features=5),
+        ],
+        combination=spec_combinators.CombinationMode.CONCAT,
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 3, 4)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    self.assertEqual(y.channel_shape, (8,))
+    self.verify_contract(layer, x)
+
+  def test_parallel_add(self):
+    config = self.sl.combinators.Parallel.Config(
+        [
+            self.create_dummy_layer_config(val=1.0),
+            self.create_dummy_layer_config(val=2.0),
+        ],
+        combination=spec_combinators.CombinationMode.ADD,
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 3, 4)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    expected = (x.values * 2.0 + 3.0) * x.mask[..., None]
+    np.testing.assert_allclose(y.values, expected, atol=1e-6)
+
+    self.verify_contract(layer, x)
+
+  def test_parallel_mean(self):
+    config = self.sl.combinators.Parallel.Config(
+        [
+            self.create_dummy_layer_config(val=1.0),
+            self.create_dummy_layer_config(val=3.0),
+        ],
+        combination=spec_combinators.CombinationMode.MEAN,
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 3, 4)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    expected = (x.values + 2.0) * x.mask[..., None]
+    np.testing.assert_allclose(y.values, expected, atol=1e-6)
+
+    self.verify_contract(layer, x)
+
+  def test_parallel_product(self):
+    config = self.sl.combinators.Parallel.Config(
+        [
+            self.create_dummy_layer_config(val=2.0),
+            self.create_dummy_layer_config(val=3.0),
+        ],
+        combination=spec_combinators.CombinationMode.PRODUCT,
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 3, 4)
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    expected = ((x.values + 2.0) * (x.values + 3.0)) * x.mask[..., None]
+    np.testing.assert_allclose(y.values, expected, atol=1e-6)
+
+    self.verify_contract(layer, x)
diff --git a/sequence_layers/specs/conditioning.py b/sequence_layers/specs/conditioning.py
new file mode 100644
index 0000000..5d310ac
--- /dev/null
+++ b/sequence_layers/specs/conditioning.py
@@ -0,0 +1,107 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specifications for conditioning layers.
+
+See the corresponding _behaviors module for behaviors.
+"""
+
+import abc
+import dataclasses
+import enum
+from typing import Any, override, Protocol, runtime_checkable
+
+from sequence_layers.specs import types as types_spec
+
+
+@enum.unique
+class Projection(enum.Enum):
+  """The type of projection to perform."""
+
+  # No projection.
+  IDENTITY = 1
+  # Dense projection from every element of c at a given time step, to a tensor
+  # of the same shape as x at given time step.
+  LINEAR = 2
+  # Dense projection from every element of c at a given time step, to a tensor
+  # of shape [2, x.shape...] at given time step.
+  LINEAR_AFFINE = 3
+
+
+@enum.unique
+class Combination(enum.Enum):
+  """The type of combination to perform."""
+
+  # Broadcast-add conditioning.
+  ADD = 1
+  # Broadcast-concat conditioning.
+  CONCAT = 2
+  # Affine conditioning. Requires LINEAR_AFFINE projection.
+  AFFINE = 3
+  # Affine shift conditioning. Requires LINEAR projection.
+  AFFINE_SHIFT = 4
+  # Affine scale conditioning. Requires LINEAR projection.
+  AFFINE_SCALE = 5
+  # Broadcast-multiply conditioning. Requires LINEAR or IDENTITY projection.
+  MUL = 6
+  # Broadcast-concat conditioning via prepending.
+  CONCAT_BEFORE = 7
+
+
+class BaseConditioning[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Base specification for conditioning layers."""
+
+  # For backward compatibility with nested enum references
+  Projection = Projection
+  Combination = Combination
+
+
+class Conditioning[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BaseConditioning[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Conditioning layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Conditioning."""
+
+    conditioning_name: str
+    projection: Projection
+    combination: Combination
+    projection_channel_shape: types_spec.Shape | None = None
+    streaming: bool = False
+    affine_scale_offset: complex = 1.0
+    compute_dtype: Any = None
+    param_dtype: Any = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Protocol for conditioning module."""
+
+  @property
+  def Conditioning(self) -> type[Conditioning]:
+    ...
diff --git a/sequence_layers/specs/conditioning_behaviors.py b/sequence_layers/specs/conditioning_behaviors.py
new file mode 100644
index 0000000..c5c3140
--- /dev/null
+++ b/sequence_layers/specs/conditioning_behaviors.py
@@ -0,0 +1,362 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Behavior tests for conditioning layers."""
+
+# pylint: disable=abstract-method
+# pyrefly: disable=bad-instantiation
+
+from absl.testing import parameterized
+import numpy as np
+
+from sequence_layers.specs import conditioning as conditioning_spec
+from sequence_layers.specs import test_utils
+
+
+class ConditioningTest(test_utils.SequenceLayerTest):
+  """Test behavior of Conditioning layer."""
+
+  def _make_constants(self, conditioning_seq, name='cond'):
+    return {name: conditioning_seq}
+
+  def test_identity_add(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 4)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, constants=constants)
+    self.verify_contract(layer, x, pad_constants=True, constants=constants)
+
+  def test_identity_add_output_shape(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 4)
+    constants = self._make_constants(cond_seq)
+    self.assertEqual(layer.get_output_shape((4,), constants=constants), (4,))
+
+  def test_identity_add_broadcast(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 1)
+    constants = self._make_constants(cond_seq)
+    self.assertEqual(layer.get_output_shape((4,), constants=constants), (4,))
+
+  def test_tensor_conditioning(self):
+    """Conditioning with a [B, dim] tensor (not a Sequence)."""
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    # Generate a random sequence and extract its values to get a raw tensor.
+    cond_seq = self.random_sequence(2, 1, 4)
+    # Squeeze the time dimension to get [B, C]
+    cond_values = cond_seq.values
+    if hasattr(cond_values, 'squeeze'):
+      cond_tensor = cond_values.squeeze(axis=1)
+    else:
+      # Fallback if squeeze is not available (should be for both JAX and MLX)
+      cond_tensor = cond_values[:, 0]
+
+    constants = self._make_constants(cond_tensor)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, constants=constants)
+    y = layer.layer(x, training=False, constants=constants)
+    self.assertEqual(y.channel_shape, (4,))
+
+  def test_step_non_streaming(self):
+    """Non-streaming: full conditioning passed, layer slices per step."""
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.ADD,
+        streaming=False,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 4)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 8, 4)
+
+    layer = self.init_layer(layer, x, constants=constants)
+
+    # Layer mode.
+    y_layer = layer.layer(x, training=False, constants=constants)
+
+    # Step mode (pass full conditioning; layer slices internally).
+    y_step, _ = self._step_by_step(layer, x, block_size=1, constants=constants)
+    self.assertSequencesClose(y_step, y_layer)
+
+  def test_step_streaming(self):
+    """Streaming: conditioning chunks arrive with input chunks."""
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.ADD,
+        streaming=True,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 4)
+    x = self.random_sequence(2, 8, 4)
+    constants = self._make_constants(cond_seq)
+
+    layer = self.init_layer(layer, x, constants=constants)
+
+    # Layer mode.
+    y_layer = layer.layer(x, training=False, constants=constants)
+
+    # Step mode with stream_constants.
+    y_step, _ = self._step_by_step(
+        layer,
+        x,
+        block_size=1,
+        stream_constants=constants,
+    )
+    self.assertSequencesClose(y_step, y_layer)
+
+  def test_identity_concat(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.CONCAT,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 3)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, constants=constants)
+    y = layer.layer(x, training=False, constants=constants)
+    self.assertEqual(y.channel_shape, (7,))
+
+  def test_concat_before(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.CONCAT_BEFORE,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 3)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, constants=constants)
+    y = layer.layer(x, training=False, constants=constants)
+    self.assertEqual(y.channel_shape, (7,))
+    # CONCAT_BEFORE should have conditioning first.
+    y_cond = y[:, :, :3]
+    # Align cond_seq mask with y mask for comparison (y.mask is c.mask & x.mask)
+    cond_seq_aligned = self.sl.Sequence(cond_seq.values, y.mask).mask_invalid()
+    self.assertSequencesClose(y_cond, cond_seq_aligned)
+
+  def test_identity_mul(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.MUL,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 4)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, constants=constants)
+    self.verify_contract(layer, x, pad_constants=True, constants=constants)
+
+  def test_linear_add(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.LINEAR,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 6)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, constants=constants)
+    self.verify_contract(layer, x, pad_constants=True, constants=constants)
+
+  def test_linear_add_output_shape(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.LINEAR,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 6)
+    constants = self._make_constants(cond_seq)
+    # LINEAR projects conditioning to input channel shape.
+    self.assertEqual(layer.get_output_shape((4,), constants=constants), (4,))
+
+  def test_with_projection_channel_shape(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.LINEAR,
+        combination=conditioning_spec.Combination.ADD,
+        projection_channel_shape=(8,),
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 6)
+    constants = self._make_constants(cond_seq)
+    # Projects to (8,), then broadcast-add with input (8,).
+    self.assertEqual(layer.get_output_shape((8,), constants=constants), (8,))
+
+  def test_linear_affine_shift(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.LINEAR,
+        combination=conditioning_spec.Combination.AFFINE_SHIFT,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 6)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, constants=constants)
+    self.verify_contract(layer, x, pad_constants=True, constants=constants)
+
+  def test_linear_affine_scale(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.LINEAR,
+        combination=conditioning_spec.Combination.AFFINE_SCALE,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 6)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, constants=constants)
+    self.verify_contract(layer, x, pad_constants=True, constants=constants)
+
+  def test_linear_affine(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.LINEAR_AFFINE,
+        combination=conditioning_spec.Combination.AFFINE,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 6)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, constants=constants)
+    self.verify_contract(layer, x, pad_constants=True, constants=constants)
+
+  def test_linear_affine_output_shape(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.LINEAR_AFFINE,
+        combination=conditioning_spec.Combination.AFFINE,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 6)
+    constants = self._make_constants(cond_seq)
+    # AFFINE combination strips the '2' dim from projected shape.
+    self.assertEqual(layer.get_output_shape((4,), constants=constants), (4,))
+
+  def test_affine_requires_linear_affine(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.LINEAR,
+        combination=conditioning_spec.Combination.AFFINE,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 4)
+    constants = self._make_constants(cond_seq)
+    with self.assertRaises(ValueError):
+      layer.get_output_shape((4,), constants=constants)
+
+  def test_affine_shift_requires_linear(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.AFFINE_SHIFT,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 4)
+    constants = self._make_constants(cond_seq)
+    with self.assertRaises(ValueError):
+      layer.get_output_shape((4,), constants=constants)
+
+  def test_affine_scale_requires_linear(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.AFFINE_SCALE,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 4)
+    constants = self._make_constants(cond_seq)
+    with self.assertRaises(ValueError):
+      layer.get_output_shape((4,), constants=constants)
+
+  def test_linear_affine_requires_affine(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.LINEAR_AFFINE,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    cond_seq = self.random_sequence(2, 8, 4)
+    constants = self._make_constants(cond_seq)
+    with self.assertRaises(ValueError):
+      layer.get_output_shape((4,), constants=constants)
+
+  def test_missing_constants(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, bind_only=True)
+    with self.assertRaises(ValueError):
+      layer.layer(x, training=False, constants=None)
+
+  def test_missing_key(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(2, 8, 4)
+    layer = self.init_layer(layer, x, bind_only=True)
+    dummy_seq = self.random_sequence(2, 8, 4)
+    with self.assertRaises(ValueError):
+      layer.layer(x, training=False, constants={'other': dummy_seq})
+
+  def test_from_config_identity_add(self):
+    config = self.sl.Conditioning.Config(
+        conditioning_name='cond',
+        projection=conditioning_spec.Projection.IDENTITY,
+        combination=conditioning_spec.Combination.ADD,
+    )
+    layer = self.make_layer(config)
+    self.assertIsInstance(layer, self.sl.Conditioning)
+
+    cond_seq = self.random_sequence(2, 5, 8)
+    constants = self._make_constants(cond_seq)
+    x = self.random_sequence(2, 5, 8)
+    layer = self.init_layer(layer, x, constants=constants)
+    y = layer.layer(x, training=False, constants=constants)
+    self.assertEqual(y.channel_shape, (8,))
diff --git a/sequence_layers/specs/convolution.py b/sequence_layers/specs/convolution.py
new file mode 100644
index 0000000..fa9f1e1
--- /dev/null
+++ b/sequence_layers/specs/convolution.py
@@ -0,0 +1,211 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specifications for convolution layers.
+
+See the corresponding _behaviors module for behaviors.
+"""
+
+import abc
+import dataclasses
+from typing import (Any, Callable, override, Protocol, runtime_checkable,
+                    Sequence)
+
+from sequence_layers.specs import types as types_spec
+
+
+class BaseConv[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Base specification for convolution layers."""
+
+
+class Conv1D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BaseConv[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Conv1D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Conv1D."""
+
+    filters: int
+    kernel_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: types_spec.PaddingModeString = types_spec.PaddingMode.VALID.value
+    groups: int = 1
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = None  # Can be numpy, jax, or mlx dtype
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class DepthwiseConv1D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BaseConv[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for DepthwiseConv1D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for DepthwiseConv1D."""
+
+    kernel_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: types_spec.PaddingModeString = types_spec.PaddingMode.VALID.value
+    channel_multiplier: int = 1
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class Conv1DTranspose[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Conv1DTranspose layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Conv1DTranspose."""
+
+    filters: int
+    kernel_size: int
+    strides: int = 1
+    padding: types_spec.PaddingModeString = types_spec.PaddingMode.VALID.value
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class Conv2D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BaseConv[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Conv2D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Conv2D."""
+
+    filters: int
+    kernel_size: int | Sequence[int]
+    strides: int | Sequence[int] = 1
+    dilation_rate: int | Sequence[int] = 1
+    time_padding: types_spec.PaddingModeString = (
+        types_spec.PaddingMode.VALID.value
+    )
+    spatial_padding: types_spec.PaddingModeString | tuple[int, int] = (
+        types_spec.PaddingMode.SAME.value
+    )
+    groups: int = 1
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class Conv2DTranspose[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Conv2DTranspose layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Conv2DTranspose."""
+
+    filters: int
+    kernel_size: int | Sequence[int]
+    strides: int | Sequence[int] = 1
+    time_padding: types_spec.PaddingModeString = (
+        types_spec.PaddingMode.VALID.value
+    )
+    spatial_padding: types_spec.PaddingModeString | tuple[int, int] = (
+        types_spec.PaddingMode.SAME.value
+    )
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: Any = None
+    param_dtype: Any = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Protocol for convolution module."""
+
+  # pylint: disable=invalid-name
+  # pylint: disable=missing-function-docstring
+
+  @property
+  def Conv1D(self) -> type[Conv1D]:
+    ...
+
+  @property
+  def DepthwiseConv1D(self) -> type[DepthwiseConv1D]:
+    ...
+
+  @property
+  def Conv1DTranspose(self) -> type[Conv1DTranspose]:
+    ...
+
+  @property
+  def Conv2D(self) -> type[Conv2D]:
+    ...
+
+  @property
+  def Conv2DTranspose(self) -> type[Conv2DTranspose]:
+    ...
diff --git a/sequence_layers/specs/convolution_behaviors.py b/sequence_layers/specs/convolution_behaviors.py
new file mode 100644
index 0000000..af34c00
--- /dev/null
+++ b/sequence_layers/specs/convolution_behaviors.py
@@ -0,0 +1,347 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Behavior tests for convolution layers.
+
+Backend-specific test files should inherit from these tests.
+"""
+
+# pylint: disable=abstract-method
+# pyrefly: disable=bad-instantiation
+
+import fractions
+
+from absl.testing import parameterized
+
+from sequence_layers.specs import test_utils
+
+
+class Conv1DTest(test_utils.SequenceLayerTest):
+  """Test behavior of Conv1D layer."""
+
+  @parameterized.product(
+      params=[
+          # 1x1 conv.
+          (1, 1, 1),
+          # even kernel_size with smaller, equal and larger strides.
+          (2, 1, 1),
+          (2, 2, 1),
+          (2, 3, 1),
+          # odd kernel_size with smaller, equal and larger strides.
+          (3, 2, 1),
+          (3, 3, 1),
+          (3, 4, 1),
+          # kernel_size smaller, equal and larger than even dilation_rate.
+          (1, 1, 2),
+          (2, 1, 2),
+          (3, 1, 2),
+          # kernel_size smaller, equal and larger than odd dilation_rate.
+          (1, 1, 3),
+          (2, 1, 3),
+          (3, 1, 3),
+      ],
+      padding=[
+          'same',
+          'valid',
+          'causal_valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+  )
+  def test_conv1d(self, params, padding):
+    kernel_size, stride, dilation_rate = params
+    config = self.sl.Conv1D.Config(
+        filters=2,
+        kernel_size=kernel_size,
+        strides=stride,
+        dilation_rate=dilation_rate,
+        padding=padding,
+        name='conv1d',
+    )
+    l = self.make_layer(config)
+    self.assertEqual(l.block_size, stride)
+    self.assertEqual(1 / l.output_ratio, stride)
+    self.assertEqual(l.name, 'conv1d')
+
+    supports_step = padding in (
+        'causal_valid',
+        'reverse_causal_valid',
+        'causal',
+        'reverse_causal',
+        'semicausal',
+    )
+    self.assertEqual(l.supports_step, supports_step)
+
+    effective_kernel_size = (kernel_size - 1) * dilation_rate + 1
+    expected_input_latency = (
+        effective_kernel_size - 1
+        if padding in ('reverse_causal_valid', 'reverse_causal')
+        else 0
+    )
+    self.assertEqual(l.input_latency, expected_input_latency)
+    self.assertEqual(l.output_latency, expected_input_latency // stride)
+
+    batch_size, channels = 2, 3
+    x = self.random_sequence(batch_size, 1, channels)
+    l = self.init_layer(l, x)
+
+    output_spec = l.get_output_spec(x.channel_spec)
+    self.assertEqual(output_spec.shape, (2,))  # config.filters = 2
+
+    for time in range(20 * l.block_size - 1, 20 * l.block_size + 2):
+      x = self.random_sequence(batch_size, time, channels)
+      self.verify_contract(l, x, training=False)
+
+
+class DepthwiseConv1DTest(test_utils.SequenceLayerTest):
+  """Test behavior of DepthwiseConv1D layer."""
+
+  @parameterized.product(
+      params=[
+          # kernel_size with smaller, equal and larger strides.
+          (2, 1, 1),
+          (2, 2, 1),
+          (2, 3, 1),
+          (3, 2, 1),
+          (3, 3, 1),
+          (3, 4, 1),
+          # dilation_rate.
+          (3, 1, 2),
+          (3, 1, 3),
+      ],
+      padding=[
+          'same',
+          'valid',
+          'causal_valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+      channel_multiplier=[1, 2],
+  )
+  def test_depthwise_conv1d(self, params, padding, channel_multiplier):
+    kernel_size, stride, dilation_rate = params
+    config = self.sl.DepthwiseConv1D.Config(
+        kernel_size=kernel_size,
+        strides=stride,
+        dilation_rate=dilation_rate,
+        padding=padding,
+        channel_multiplier=channel_multiplier,
+        name='depthwise_conv1d',
+    )
+    l = self.make_layer(config)
+    self.assertEqual(l.block_size, stride)
+    self.assertEqual(1 / l.output_ratio, stride)
+    self.assertEqual(l.name, 'depthwise_conv1d')
+
+    supports_step = padding in (
+        'causal_valid',
+        'reverse_causal_valid',
+        'causal',
+        'reverse_causal',
+        'semicausal',
+    )
+    self.assertEqual(l.supports_step, supports_step)
+
+    effective_kernel_size = (kernel_size - 1) * dilation_rate + 1
+    expected_input_latency = (
+        effective_kernel_size - 1
+        if padding in ('reverse_causal_valid', 'reverse_causal')
+        else 0
+    )
+    self.assertEqual(l.input_latency, expected_input_latency)
+    self.assertEqual(l.output_latency, expected_input_latency // stride)
+
+    batch_size, channels = 2, 3
+    x = self.random_sequence(batch_size, 1, channels)
+    l = self.init_layer(l, x)
+
+    output_spec = l.get_output_spec(x.channel_spec)
+    self.assertEqual(output_spec.shape, (channels * channel_multiplier,))
+
+    for time in range(20 * l.block_size - 1, 20 * l.block_size + 2):
+      x = self.random_sequence(batch_size, time, channels)
+      self.verify_contract(l, x, training=False)
+
+
+class Conv1DTransposeTest(test_utils.SequenceLayerTest):
+  """Test behavior of Conv1DTranspose layer."""
+
+  @parameterized.product(
+      params=[
+          (1, 1),
+          (2, 1),
+          (2, 2),
+          (2, 3),
+          (3, 2),
+          (3, 3),
+          (3, 4),
+      ],
+      padding=[
+          'same',
+          'valid',
+          'causal',
+      ],
+  )
+  def test_conv1d_transpose(self, params, padding):
+    kernel_size, stride = params
+    config = self.sl.Conv1DTranspose.Config(
+        filters=2,
+        kernel_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        name='conv1d_transpose',
+    )
+    l = self.make_layer(config)
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, fractions.Fraction(stride))
+    self.assertEqual(l.name, 'conv1d_transpose')
+
+    # Transpose convolution layers in Step mode are only supported for causal padding.
+    self.assertEqual(l.supports_step, padding == 'causal')
+
+    batch_size, channels = 2, 3
+    x = self.random_sequence(batch_size, 1, channels)
+    l = self.init_layer(l, x)
+
+    output_spec = l.get_output_spec(x.channel_spec)
+    self.assertEqual(output_spec.shape, (2,))
+
+    for time in range(5, 10):
+      x = self.random_sequence(batch_size, time, channels)
+      # Just verify it runs and produces expected shape
+      y = l.layer(x, training=False)
+      self.assertEqual(y.channel_shape, (2,))
+      # Test basic verify_contract (without step check since not supported)
+      self.verify_contract(l, x, training=False)
+
+
+class Conv2DTest(test_utils.SequenceLayerTest):
+  """Test behavior of Conv2D layer."""
+
+  @parameterized.product(
+      params=[
+          # kernel_size, strides, dilation_rate
+          ((3, 3), (1, 1), (1, 1)),
+          ((3, 3), (2, 2), (1, 1)),
+          ((3, 3), (1, 1), (2, 2)),
+      ],
+      time_padding=[
+          'same',
+          'valid',
+          'causal_valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+      spatial_padding=[
+          'same',
+          'valid',
+      ],
+  )
+  def test_conv2d(self, params, time_padding, spatial_padding):
+    kernel_size, stride, dilation_rate = params
+    config = self.sl.Conv2D.Config(
+        filters=2,
+        kernel_size=kernel_size,
+        strides=stride,
+        dilation_rate=dilation_rate,
+        time_padding=time_padding,
+        spatial_padding=spatial_padding,
+        name='conv2d',
+    )
+    l = self.make_layer(config)
+    self.assertEqual(l.block_size, stride[0])
+    self.assertEqual(1 / l.output_ratio, stride[0])
+    self.assertEqual(l.name, 'conv2d')
+
+    supports_step = time_padding in (
+        'causal_valid',
+        'reverse_causal_valid',
+        'causal',
+        'reverse_causal',
+        'semicausal',
+    )
+    self.assertEqual(l.supports_step, supports_step)
+
+    effective_kernel_size_t = (kernel_size[0] - 1) * dilation_rate[0] + 1
+    expected_input_latency = (
+        effective_kernel_size_t - 1
+        if time_padding in ('reverse_causal_valid', 'reverse_causal')
+        else 0
+    )
+    self.assertEqual(l.input_latency, expected_input_latency)
+    self.assertEqual(l.output_latency, expected_input_latency // stride[0])
+
+    batch_size, spatial_dim, channels = 2, 8, 3
+    x = self.random_sequence(batch_size, 1, spatial_dim, channels)
+    l = self.init_layer(l, x)
+
+    # Channel shape of Conv2D sequence contains the spatial dimension + filters
+    output_spec = l.get_output_spec(x.channel_spec)
+    self.assertEqual(output_spec.shape[-1], 2)
+
+    for time in range(20 * l.block_size - 1, 20 * l.block_size + 2):
+      x = self.random_sequence(batch_size, time, spatial_dim, channels)
+      self.verify_contract(l, x, training=False)
+
+
+class Conv2DTransposeTest(test_utils.SequenceLayerTest):
+  """Test behavior of Conv2DTranspose layer."""
+
+  @parameterized.product(
+      params=[
+          ((3, 3), (2, 2)),
+          ((3, 3), (1, 1)),
+      ],
+      time_padding=[
+          'same',
+          'valid',
+          'causal',
+      ],
+      spatial_padding=[
+          'same',
+          'valid',
+      ],
+  )
+  def test_conv2d_transpose(self, params, time_padding, spatial_padding):
+    kernel_size, stride = params
+    config = self.sl.Conv2DTranspose.Config(
+        filters=2,
+        kernel_size=kernel_size,
+        strides=stride,
+        time_padding=time_padding,
+        spatial_padding=spatial_padding,
+        name='conv2d_transpose',
+    )
+    l = self.make_layer(config)
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, fractions.Fraction(stride[0]))
+    self.assertEqual(l.name, 'conv2d_transpose')
+    # Transpose convolution layers in Step mode are only supported for causal padding.
+    self.assertEqual(l.supports_step, time_padding == 'causal')
+
+    batch_size, spatial_dim, channels = 2, 8, 3
+    x = self.random_sequence(batch_size, 1, spatial_dim, channels)
+    l = self.init_layer(l, x)
+
+    output_spec = l.get_output_spec(x.channel_spec)
+    self.assertEqual(output_spec.shape[-1], 2)
+
+    for time in range(5, 10):
+      x = self.random_sequence(batch_size, time, spatial_dim, channels)
+      self.verify_contract(l, x, training=False)
diff --git a/sequence_layers/specs/dense.py b/sequence_layers/specs/dense.py
new file mode 100644
index 0000000..e3a591f
--- /dev/null
+++ b/sequence_layers/specs/dense.py
@@ -0,0 +1,61 @@
+"""Specifications for dense layers.
+
+See the corresponding _behaviors module for behaviors.
+"""
+
+import abc
+import dataclasses
+from typing import Any, Callable, override, Sequence
+
+from sequence_layers.specs import types as types_spec
+
+
+class Dense[
+    SequenceT: types_spec.Sequence = types_spec.Sequence,
+    ShapeDTypeT: types_spec.ChannelSpec = types_spec.ChannelSpec,
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Dense layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Dense layer."""
+
+    features: int
+    use_bias: bool = True
+    activation: Callable | None = None
+    compute_dtype: types_spec.DType | None = None
+    param_dtype: types_spec.DType | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class EinsumDense[
+    SequenceT: types_spec.Sequence = types_spec.Sequence,
+    ShapeDTypeT: types_spec.ChannelSpec = types_spec.ChannelSpec,
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for EinsumDense layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for EinsumDense layer."""
+
+    equation: str
+    output_shape: Sequence[int | None]
+    bias_axes: str = ''
+    activation: Callable | None = None
+    compute_dtype: types_spec.DType | None = None
+    param_dtype: types_spec.DType | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
diff --git a/sequence_layers/specs/dense_behaviors.py b/sequence_layers/specs/dense_behaviors.py
new file mode 100644
index 0000000..87a2773
--- /dev/null
+++ b/sequence_layers/specs/dense_behaviors.py
@@ -0,0 +1,109 @@
+"""Behavior tests for dense layers.
+
+Backend-specific test files should inherit from these tests.
+"""
+
+# pylint: disable=abstract-method
+
+from absl.testing import parameterized
+
+from sequence_layers.specs import test_utils
+
+
+class DenseTest(test_utils.SequenceLayerTest):
+  """Test behavior of Dense layer."""
+
+  def test_rank2_unsupported(self):
+    l = self.sl.Dense.Config(features=3, name='dense').make()
+    x = self.random_sequence(2, 13)
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+  @parameterized.parameters(((5,),), ((5, 7),))
+  def test_dense(self, channels_shape):
+    l = self.sl.Dense.Config(features=3, name='dense').make()
+    x = self.random_sequence(2, 13, *channels_shape, random_mask=True)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'dense')
+    self.assertEqual(
+        l.get_output_shape_for_sequence(x), channels_shape[:-1] + (3,)
+    )
+    self.verify_contract(l, x, training=False)
+
+  @parameterized.parameters(True, False)
+  def test_use_bias(self, use_bias):
+    l = self.sl.Dense.Config(features=3, use_bias=use_bias).make()
+    x = self.random_sequence(2, 3, 5)
+    l = self.init_layer(l, x)
+    self.verify_contract(l, x, training=False)
+
+
+class EinsumDenseTest(test_utils.SequenceLayerTest):
+  """Test behavior of EinsumDense layer."""
+
+  @parameterized.parameters(
+      (
+          (2, 3, 5),
+          '...a,ab->...b',
+          (7,),
+          '',
+          (7,),
+      ),
+      (
+          (2, 3, 5, 7),
+          '...ab,ac->...cb',
+          (11, 7),
+          'c',
+          (11, 7),
+      ),
+      (
+          (2, 3, 5, 7),
+          '...ab,b->...a',
+          (None,),
+          '',
+          (5,),
+      ),
+  )
+  def test_einsum_dense(
+      self,
+      shape,
+      equation,
+      output_shape,
+      bias_axes,
+      expected_output_shape,
+  ):
+    x = self.random_sequence(*shape)
+    l = self.sl.EinsumDense.Config(
+        equation=equation,
+        output_shape=output_shape,
+        bias_axes=bias_axes,
+        name='einsum_dense',
+    ).make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'einsum_dense')
+    self.assertEqual(l.get_output_shape_for_sequence(x), expected_output_shape)
+    self.verify_contract(l, x, training=False)
+
+  def test_einsum_dense_nonbroadcasting_equation(self):
+    with self.assertRaises(ValueError):
+      x = self.random_sequence(2, 3, 4, 5, 6)
+      l = self.sl.EinsumDense.Config(
+          equation='btabc,bc->btad', output_shape=[None, 2]
+      ).make()
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+  def test_einsum_dense_inconsistent_input_shape(self):
+    x = self.random_sequence(2, 3, 5)
+    l = self.sl.EinsumDense.Config(
+        equation='...abc,bc->...ad', output_shape=[None, 2]
+    ).make()
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
diff --git a/sequence_layers/specs/dsp.py b/sequence_layers/specs/dsp.py
new file mode 100644
index 0000000..d312fd2
--- /dev/null
+++ b/sequence_layers/specs/dsp.py
@@ -0,0 +1,356 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specifications for digital signal processing (DSP) layers.
+
+See the corresponding _behaviors module for behaviors.
+"""
+
+import abc
+import dataclasses
+from typing import Any, Callable, override, Protocol, runtime_checkable
+
+from sequence_layers.specs import types as types_spec
+
+
+class Delay[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesShape[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Delay layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Delay layer."""
+
+    length: int = 0
+    delay_layer_output: bool = True
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class Lookahead[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesShape[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Lookahead layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Lookahead layer."""
+
+    length: int = 0
+    preserve_length_in_layer: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class Window[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesShape[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Window layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Window layer."""
+
+    axis: int
+    window_fn: Callable[..., Any] | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class Frame[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Frame layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Frame layer."""
+
+    frame_length: int
+    frame_step: int
+    padding: tuple[int, int] | types_spec.PaddingModeString = (
+        'reverse_causal_valid'
+    )
+    explicit_padding_is_same_like: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class OverlapAdd[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for OverlapAdd layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for OverlapAdd layer."""
+
+    frame_length: int
+    frame_step: int
+    padding: types_spec.PaddingModeString = 'valid'
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class FFT[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for FFT layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for FFT layer."""
+
+    fft_length: int | None = None
+    axis: int = -1
+    padding: str = 'right'
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class IFFT[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for IFFT layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for IFFT layer."""
+
+    fft_length: int | None = None
+    frame_length: int | None = None
+    axis: int = -1
+    padding: str = 'right'
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class RFFT[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for RFFT layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for RFFT layer."""
+
+    fft_length: int | None = None
+    axis: int = -1
+    padding: str = 'right'
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class IRFFT[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for IRFFT layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for IRFFT layer."""
+
+    fft_length: int | None = None
+    frame_length: int | None = None
+    axis: int = -1
+    padding: str = 'right'
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class STFT[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for STFT layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for STFT layer."""
+
+    frame_length: int
+    frame_step: int
+    fft_length: int
+    window_fn: Callable[..., Any] | None = None
+    time_padding: types_spec.PaddingModeString = 'reverse_causal_valid'
+    fft_padding: str = 'right'
+    output_magnitude: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class InverseSTFT[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for InverseSTFT layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for InverseSTFT layer."""
+
+    frame_length: int
+    frame_step: int
+    fft_length: int
+    window_fn: Callable[..., Any] | None = None
+    time_padding: types_spec.PaddingModeString = 'causal'
+    fft_padding: str = 'right'
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class LinearToMelSpectrogram[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for LinearToMelSpectrogram layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for LinearToMelSpectrogram layer."""
+
+    num_mel_bins: int
+    sample_rate: float
+    lower_edge_hertz: float
+    upper_edge_hertz: float
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Protocol for DSP module."""
+
+  # pylint: disable=invalid-name
+  # pylint: disable=missing-function-docstring
+
+  @property
+  def Delay(self) -> type[Delay]:
+    ...
+
+  @property
+  def Lookahead(self) -> type[Lookahead]:
+    ...
+
+  @property
+  def Window(self) -> type[Window]:
+    ...
+
+  @property
+  def Frame(self) -> type[Frame]:
+    ...
+
+  @property
+  def OverlapAdd(self) -> type[OverlapAdd]:
+    ...
+
+  @property
+  def FFT(self) -> type[FFT]:
+    ...
+
+  @property
+  def IFFT(self) -> type[IFFT]:
+    ...
+
+  @property
+  def RFFT(self) -> type[RFFT]:
+    ...
+
+  @property
+  def IRFFT(self) -> type[IRFFT]:
+    ...
+
+  @property
+  def STFT(self) -> type[STFT]:
+    ...
+
+  @property
+  def InverseSTFT(self) -> type[InverseSTFT]:
+    ...
+
+  @property
+  def LinearToMelSpectrogram(self) -> type[LinearToMelSpectrogram]:
+    ...
diff --git a/sequence_layers/specs/dsp_behaviors.py b/sequence_layers/specs/dsp_behaviors.py
new file mode 100644
index 0000000..fc35a8b
--- /dev/null
+++ b/sequence_layers/specs/dsp_behaviors.py
@@ -0,0 +1,428 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Behavior tests for digital signal processing (DSP) layers.
+
+Backend-specific test files should inherit from these tests.
+"""
+
+# pylint: disable=abstract-method
+# pyrefly: disable=bad-instantiation
+
+from absl.testing import parameterized
+import numpy as np
+
+from sequence_layers.specs import test_utils
+
+
+def _pad_or_truncate_for_fft(
+    values: np.ndarray, padding: str, axis: int, required_input_length: int
+) -> np.ndarray:
+  """Pads or truncates values to required_input_length along axis."""
+  axis_size = values.shape[axis]
+  pad_amount = max(0, required_input_length - axis_size)
+  if padding == 'center':
+    left = pad_amount // 2
+    right = pad_amount - left
+  else:
+    assert padding == 'right'
+    left, right = 0, pad_amount
+
+  paddings = [(0, 0)] * values.ndim
+  paddings[axis] = (left, right)
+  values = np.pad(values, paddings)
+  axis_size = values.shape[axis]
+
+  trim_amount = max(0, axis_size - required_input_length)
+  if padding == 'center':
+    left = trim_amount // 2
+  else:
+    left = 0
+
+  slices = [slice(None)] * values.ndim
+  slices[axis] = slice(left, left + required_input_length)
+  return values[tuple(slices)]
+
+
+class FFTTest(test_utils.SequenceLayerTest):
+  """Test behavior of FFT layer."""
+
+  @parameterized.product(
+      shape_axis=[((2, 3, 32), -1), ((2, 3, 5, 32), -1), ((2, 3, 5, 32), -2)],
+      fft_length=[31, 32, 33],
+      padding=['center', 'right'],
+  )
+  def test_fft(self, shape_axis, fft_length, padding):
+    shape, axis = shape_axis
+    x = self.random_sequence(*shape, low_length=1)
+    config = self.sl.FFT.Config(
+        fft_length=fft_length,
+        axis=axis,
+        padding=padding,
+        name='fft',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'fft')
+
+    channel_shape = list(shape[2:])
+    channel_shape[axis] = fft_length
+    self.assertEqual(l.get_output_shape(shape[2:]), tuple(channel_shape))
+    y = self.verify_contract(l, x, training=False)
+
+    # Check that the result is the same as manually padding/truncating followed by FFT.
+    def apply_fft(values):
+      values = _pad_or_truncate_for_fft(values, padding, axis, fft_length)
+      return np.fft.fft(values, n=fft_length, axis=axis)
+
+    y_expected = x.apply_values(apply_fft).mask_invalid()
+    self.assertSequencesClose(y, y_expected, atol=1e-4, rtol=1e-4)
+    self.assertEqual(y.shape[axis], fft_length)
+
+
+class IFFTTest(test_utils.SequenceLayerTest):
+  """Test behavior of IFFT layer."""
+
+  @parameterized.product(
+      shape_axis=[((2, 3, 32), -1), ((2, 3, 5, 32), -1), ((2, 3, 5, 32), -2)],
+      frame_length=[31, 32, 33, None],
+      padding=['center', 'right'],
+  )
+  def test_ifft(self, shape_axis, frame_length, padding):
+    shape, axis = shape_axis
+    fft_length = shape[axis]
+    x = self.random_sequence(*shape)
+    config = self.sl.IFFT.Config(
+        fft_length=fft_length,
+        frame_length=frame_length,
+        axis=axis,
+        padding=padding,
+        name='ifft',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+
+    if frame_length is None:
+      frame_length = fft_length
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'ifft')
+
+    channel_shape = list(shape[2:])
+    channel_shape[axis] = frame_length
+    self.assertEqual(l.get_output_shape(shape[2:]), tuple(channel_shape))
+    y = self.verify_contract(l, x, training=False)
+
+    def apply_ifft(values):
+      values = np.fft.ifft(values, n=fft_length, axis=axis)
+      return _pad_or_truncate_for_fft(values, padding, axis, frame_length)
+
+    y_expected = x.apply_values(apply_ifft).mask_invalid()
+    self.assertSequencesClose(y, y_expected, atol=1e-4, rtol=1e-4)
+    self.assertEqual(y.shape[axis], frame_length)
+
+
+class RFFTTest(test_utils.SequenceLayerTest):
+  """Test behavior of RFFT layer."""
+
+  @parameterized.product(
+      shape_axis=[((2, 3, 32), -1), ((2, 3, 5, 32), -1), ((2, 3, 5, 32), -2)],
+      fft_length=[31, 32, 33],
+      padding=['center', 'right'],
+  )
+  def test_rfft(self, shape_axis, fft_length, padding):
+    shape, axis = shape_axis
+    x = self.random_sequence(*shape)
+    config = self.sl.RFFT.Config(
+        fft_length=fft_length,
+        axis=axis,
+        padding=padding,
+        name='rfft',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'rfft')
+
+    channel_shape = list(shape[2:])
+    channel_shape[axis] = fft_length // 2 + 1
+    self.assertEqual(l.get_output_shape(shape[2:]), tuple(channel_shape))
+    y = self.verify_contract(l, x, training=False)
+
+    def apply_rfft(values):
+      values = _pad_or_truncate_for_fft(values, padding, axis, fft_length)
+      return np.fft.rfft(values, n=fft_length, axis=axis)
+
+    y_expected = x.apply_values(apply_rfft).mask_invalid()
+    self.assertSequencesClose(y, y_expected, atol=1e-4, rtol=1e-4)
+    self.assertEqual(y.shape[axis], fft_length // 2 + 1)
+
+
+class IRFFTTest(test_utils.SequenceLayerTest):
+  """Test behavior of IRFFT layer."""
+
+  @parameterized.product(
+      shape_axis=[((2, 3, 17), -1), ((2, 3, 5, 17), -1)],
+      frame_length=[31, 32, 33, None],
+      padding=['center', 'right'],
+  )
+  def test_irfft(self, shape_axis, frame_length, padding):
+    shape, axis = shape_axis
+    fft_length = (shape[axis] - 1) * 2
+    x = self.random_sequence(*shape)
+    config = self.sl.IRFFT.Config(
+        fft_length=fft_length,
+        frame_length=frame_length,
+        axis=axis,
+        padding=padding,
+        name='irfft',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+
+    if frame_length is None:
+      frame_length = fft_length
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'irfft')
+
+    channel_shape = list(shape[2:])
+    channel_shape[axis] = frame_length
+    self.assertEqual(l.get_output_shape(shape[2:]), tuple(channel_shape))
+    y = self.verify_contract(l, x, training=False)
+
+    def apply_irfft(values):
+      values = np.fft.irfft(values, n=fft_length, axis=axis)
+      return _pad_or_truncate_for_fft(values, padding, axis, frame_length)
+
+    y_expected = x.apply_values(apply_irfft).mask_invalid()
+    self.assertSequencesClose(y, y_expected, atol=1e-4, rtol=1e-4)
+    self.assertEqual(y.shape[axis], frame_length)
+
+
+class FrameTest(test_utils.SequenceLayerTest):
+  """Test behavior of Frame layer."""
+
+  @parameterized.product(
+      frame_length=[1, 2, 3, 4],
+      frame_step=[1, 2, 3, 4],
+      padding=[
+          'causal_valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+  )
+  def test_frame(self, frame_length, frame_step, padding):
+    batch_size, time, channels = 2, 20 * frame_step, 3
+    x = self.random_sequence(batch_size, time, channels)
+    config = self.sl.Frame.Config(
+        frame_length=frame_length,
+        frame_step=frame_step,
+        padding=padding,
+        name='frame',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.block_size, frame_step)
+    self.assertEqual(1 / l.output_ratio, frame_step)
+    self.assertEqual(l.name, 'frame')
+    self.assertTrue(l.supports_step)
+    self.verify_contract(l, x, training=False, atol=1e-4, rtol=1e-4)
+
+
+class OverlapAddTest(test_utils.SequenceLayerTest):
+  """Test behavior of OverlapAdd layer."""
+
+  @parameterized.product(
+      frame_length=[1, 2, 3, 4],
+      frame_step=[1, 2, 3, 4],
+      padding=['causal', 'valid', 'semicausal_full'],
+  )
+  def test_overlap_add(self, frame_length, frame_step, padding):
+    if frame_length < frame_step:
+      return  # Pre-condition requirement
+    batch_size, time, channels = 2, 20, 3
+    x = self.random_sequence(batch_size, time, frame_length, channels)
+    config = self.sl.OverlapAdd.Config(
+        frame_length=frame_length,
+        frame_step=frame_step,
+        padding=padding,
+        name='overlap_add',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.output_ratio, frame_step)
+    self.assertEqual(l.name, 'overlap_add')
+    self.verify_contract(l, x, training=False, atol=1e-4, rtol=1e-4)
+
+
+class STFTTest(test_utils.SequenceLayerTest):
+  """Test behavior of STFT layer."""
+
+  @parameterized.product(
+      frame_length=[4, 8],
+      frame_step=[2, 4],
+      fft_length=[8, 16],
+      time_padding=[
+          'causal_valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+      fft_padding=['center', 'right'],
+  )
+  def test_stft(
+      self, frame_length, frame_step, fft_length, time_padding, fft_padding
+  ):
+    if fft_length < frame_length:
+      return
+    batch_size, time, channels = 2, 20 * frame_step, 3
+    x = self.random_sequence(batch_size, time, channels)
+    config = self.sl.STFT.Config(
+        frame_length=frame_length,
+        frame_step=frame_step,
+        fft_length=fft_length,
+        time_padding=time_padding,
+        fft_padding=fft_padding,
+        name='stft',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.block_size, frame_step)
+    self.assertEqual(1 / l.output_ratio, frame_step)
+    self.assertEqual(l.name, 'stft')
+    self.verify_contract(l, x, training=False, atol=1e-4, rtol=1e-4)
+
+
+class InverseSTFTTest(test_utils.SequenceLayerTest):
+  """Test behavior of InverseSTFT layer."""
+
+  @parameterized.product(
+      frame_length=[4, 8],
+      frame_step=[2, 4],
+      fft_length=[8, 16],
+      time_padding=['causal', 'valid'],
+      fft_padding=['center', 'right'],
+  )
+  def test_inverse_stft(
+      self, frame_length, frame_step, fft_length, time_padding, fft_padding
+  ):
+    if fft_length < frame_length:
+      return
+    if frame_length < frame_step:
+      return
+    batch_size, time, channels = 2, 20, 3
+    # Input to InverseSTFT must be complex spectrogram bins: fft_length // 2 + 1
+    x = self.random_sequence(batch_size, time, fft_length // 2 + 1, channels)
+    config = self.sl.InverseSTFT.Config(
+        frame_length=frame_length,
+        frame_step=frame_step,
+        fft_length=fft_length,
+        time_padding=time_padding,
+        fft_padding=fft_padding,
+        name='istft',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.output_ratio, frame_step)
+    self.assertEqual(l.name, 'istft')
+    self.verify_contract(l, x, training=False, atol=1e-4, rtol=1e-4)
+
+
+class LinearToMelSpectrogramTest(test_utils.SequenceLayerTest):
+  """Test behavior of LinearToMelSpectrogram layer."""
+
+  def test_mel_spectrogram(self):
+    batch_size, time, channels = 2, 8, 257  # 257 linear spectrogram bins
+    x = self.random_sequence(batch_size, time, channels)
+    config = self.sl.LinearToMelSpectrogram.Config(
+        num_mel_bins=40,
+        sample_rate=16000.0,
+        lower_edge_hertz=80.0,
+        upper_edge_hertz=7600.0,
+        name='mel',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.name, 'mel')
+    self.verify_contract(l, x, training=False, atol=1e-4, rtol=1e-4)
+
+
+class DelayTest(test_utils.SequenceLayerTest):
+  """Test behavior of Delay layer."""
+
+  @parameterized.product(
+      length=[0, 1, 3],
+      delay_layer_output=[True, False],
+  )
+  def test_delay(self, length, delay_layer_output):
+    batch_size, time, channels = 2, 15, 3
+    x = self.random_sequence(batch_size, time, channels)
+    config = self.sl.Delay.Config(
+        length=length,
+        delay_layer_output=delay_layer_output,
+        name='delay',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.name, 'delay')
+    self.assertEqual(l.input_latency, length)
+    self.assertEqual(l.output_latency, 0 if delay_layer_output else length)
+    self.verify_contract(l, x, training=False, atol=1e-4, rtol=1e-4)
+
+
+class LookaheadTest(test_utils.SequenceLayerTest):
+  """Test behavior of Lookahead layer."""
+
+  @parameterized.product(
+      length=[0, 1, 3],
+      preserve_length_in_layer=[True, False],
+  )
+  def test_lookahead(self, length, preserve_length_in_layer):
+    batch_size, time, channels = 2, 15, 3
+    x = self.random_sequence(batch_size, time, channels)
+    config = self.sl.Lookahead.Config(
+        length=length,
+        preserve_length_in_layer=preserve_length_in_layer,
+        name='lookahead',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.name, 'lookahead')
+    self.assertEqual(l.input_latency, 0)
+    self.assertEqual(l.output_latency, length)
+    self.verify_contract(l, x, training=False, atol=1e-4, rtol=1e-4)
+
+
+class WindowTest(test_utils.SequenceLayerTest):
+  """Test behavior of Window layer."""
+
+  def test_window(self):
+    batch_size, time, channels = 2, 8, 16
+    x = self.random_sequence(batch_size, time, channels)
+    config = self.sl.Window.Config(
+        axis=-1,
+        name='window',
+    )
+    l = self.make_layer(config)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.name, 'window')
+    self.verify_contract(l, x, training=False, atol=1e-4, rtol=1e-4)
diff --git a/sequence_layers/specs/normalization.py b/sequence_layers/specs/normalization.py
new file mode 100644
index 0000000..a133c5c
--- /dev/null
+++ b/sequence_layers/specs/normalization.py
@@ -0,0 +1,136 @@
+"""Specifications for normalization layers.
+
+See the corresponding _behaviors module for behaviors.
+"""
+
+import abc
+import dataclasses
+from typing import Any, override, Sequence
+
+from sequence_layers.specs import types as types_spec
+
+
+class L2Normalize[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for L2Normalize layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for L2Normalize."""
+
+    axis: int | Sequence[int] = -1
+    epsilon: float = 1e-12
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class RMSNormalization[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for RMSNormalization layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for RMSNormalization."""
+
+    axis: int | Sequence[int] = -1
+    epsilon: float = 1e-6
+    use_scale: bool = True
+    scale_init: Any | None = None
+    compute_dtype: types_spec.DType | None = None
+    param_dtype: types_spec.DType | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class LayerNormalization[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for LayerNormalization layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for LayerNormalization."""
+
+    axis: int | Sequence[int] = -1
+    epsilon: float = 1e-6
+    use_scale: bool = True
+    use_bias: bool = True
+    reductions_in_at_least_fp32: bool = True
+    compute_dtype: types_spec.DType | None = None
+    param_dtype: types_spec.DType | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class BatchNormalization[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for BatchNormalization layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for BatchNormalization."""
+
+    axis: int | Sequence[int] = -1
+    epsilon: float = 1e-5
+    momentum: float = 0.99
+    use_scale: bool = True
+    use_bias: bool = True
+    use_fast_variance: bool = True
+    compute_dtype: types_spec.DType | None = None
+    param_dtype: types_spec.DType | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class GroupNormalization[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for GroupNormalization layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for GroupNormalization."""
+
+    num_groups: int
+    axis: int | Sequence[int] = -1
+    epsilon: float = 1e-6
+    cumulative: bool = False
+    use_scale: bool = True
+    use_bias: bool = True
+    compute_dtype: types_spec.DType | None = None
+    param_dtype: types_spec.DType | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
diff --git a/sequence_layers/specs/normalization_behaviors.py b/sequence_layers/specs/normalization_behaviors.py
new file mode 100644
index 0000000..aedfec3
--- /dev/null
+++ b/sequence_layers/specs/normalization_behaviors.py
@@ -0,0 +1,312 @@
+"""Behavior tests for normalization layers.
+
+Backend-specific test files should inherit from these tests.
+"""
+
+# pylint: disable=abstract-method
+
+import itertools
+
+from absl.testing import parameterized
+import numpy as np
+
+from sequence_layers.specs import test_utils
+
+
+class L2NormalizeTest(test_utils.SequenceLayerTest):
+  """Test behavior of L2Normalize layer."""
+
+  def test_invalid_axis(self):
+    """Normalizing over the batch or time dimension is not allowed."""
+    l = self.sl.L2Normalize.Config(axis=[-1, -2]).make()
+    x = self.random_sequence(2, 3, 5)
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+  @parameterized.parameters(
+      itertools.product(
+          (False, True),
+          [
+              ((2, 10, 3), [-1]),
+              ((2, 3, 5, 9), [-1]),
+              ((2, 3, 5, 9), [-2]),
+              ((2, 3, 5, 9), [-1, -2]),
+          ],
+      )
+  )
+  def test_l2_normalization(self, training, shape_axes):
+    shape, axes = shape_axes
+    epsilon = 1e-12
+    l = self.sl.L2Normalize.Config(
+        axis=axes, epsilon=epsilon, name='l2_normalization'
+    ).make()
+    x = self.random_sequence(*shape)
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'l2_normalization')
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
+
+    y = self.verify_contract(l, x, training=training)
+
+    # Verify the train batch is normalized correctly.
+    reduce_axes = tuple(
+        a for a in range(len(shape)) if a in axes or a - len(shape) in axes
+    )
+    x_np = np.array(x.values)
+    x_ss = np.sum(np.square(x_np), axis=reduce_axes, keepdims=True)
+    y_expected_np = x_np / np.sqrt(x_ss + epsilon)
+
+    y_np = np.array(y.values)
+    expanded_mask = np.array(y.expanded_mask())
+    y_np_masked = np.where(expanded_mask, y_np, 0.0)
+    y_expected_np_masked = np.where(expanded_mask, y_expected_np, 0.0)
+
+    np.testing.assert_allclose(
+        y_np_masked, y_expected_np_masked, rtol=1e-5, atol=1e-5
+    )
+
+
+class RMSNormalizationTest(test_utils.SequenceLayerTest):
+  """Test behavior of RMSNormalization layer."""
+
+  def test_invalid_axis(self):
+    """Normalizing over the batch or time dimension is not allowed."""
+    l = self.sl.RMSNormalization.Config(axis=[-1, -2]).make()
+    x = self.random_sequence(2, 3, 5)
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+  @parameterized.parameters(
+      itertools.product(
+          (False, True),
+          [
+              ((2, 10, 3), [-1], [3]),
+              ((2, 3, 5, 9), [-1], [9]),
+              ((2, 3, 5, 9), [-2], [5]),
+              ((2, 3, 5, 9), [-1, -2], [5, 9]),
+          ],
+      )
+  )
+  def test_rms_normalization(self, training, shape_axes):
+    shape, axes, _ = shape_axes
+    epsilon = 1e-1
+    l = self.sl.RMSNormalization.Config(
+        axis=axes, epsilon=epsilon, name='rms_normalization'
+    ).make()
+    x = self.random_sequence(*shape)
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'rms_normalization')
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
+
+    y = self.verify_contract(l, x, training=training)
+
+    # Verify the train batch is normalized correctly.
+    reduce_axes = tuple(
+        a for a in range(len(shape)) if a in axes or a - len(shape) in axes
+    )
+    x_np = np.array(x.values)
+    x_ss = np.mean(np.square(x_np), axis=reduce_axes, keepdims=True)
+    y_expected_np = x_np / np.sqrt(x_ss + epsilon)
+
+    y_np = np.array(y.values)
+    expanded_mask = np.array(y.expanded_mask())
+    y_np_masked = np.where(expanded_mask, y_np, 0.0)
+    y_expected_np_masked = np.where(expanded_mask, y_expected_np, 0.0)
+
+    np.testing.assert_allclose(
+        y_np_masked, y_expected_np_masked, rtol=1e-5, atol=1e-5
+    )
+
+
+class LayerNormalizationTest(test_utils.SequenceLayerTest):
+  """Test behavior of LayerNormalization layer."""
+
+  def test_invalid_axis(self):
+    """Normalizing over the batch or time dimension is not allowed."""
+    l = self.sl.LayerNormalization.Config(axis=[-1, -2]).make()
+    x = self.random_sequence(2, 3, 5)
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+  @parameterized.parameters(
+      itertools.product(
+          (False, True),
+          [
+              ((2, 10, 4), [-1], [4]),
+              ((2, 3, 5, 4), [-1], [4]),
+              ((2, 3, 4, 9), [-2], [4]),
+              ((2, 3, 4, 8), [-1, -2], [4, 8]),
+          ],
+      )
+  )
+  def test_layer_normalization(self, training, shape_axes):
+    shape, axes, _ = shape_axes
+    l = self.sl.LayerNormalization.Config(
+        axis=axes, name='layer_normalization'
+    ).make()
+    x = self.random_sequence(*shape)
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'layer_normalization')
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
+
+    y = self.verify_contract(l, x, training=training)
+
+    # Verify the train batch is normalized correctly.
+    reduce_axes = tuple(
+        a for a in range(len(shape)) if a in axes or a - len(shape) in axes
+    )
+    y_np = np.array(y.values)
+    mean = np.mean(y_np, axis=reduce_axes)
+    var = np.var(y_np, axis=reduce_axes)
+
+    # Invalid timesteps will have a mean and variance of zero.
+    np.testing.assert_allclose(mean, np.zeros_like(mean), rtol=1e-5, atol=1e-5)
+    mask = np.array(y.mask, dtype=np.float32)
+    mask = np.reshape(
+        mask, mask.shape + (1,) * (len(mean.shape) - len(mask.shape))
+    )
+    np.testing.assert_allclose(
+        var, np.broadcast_to(mask, mean.shape), rtol=1e-4, atol=1e-4
+    )
+
+
+class BatchNormalizationTest(test_utils.SequenceLayerTest):
+  """Test behavior of BatchNormalization layer."""
+
+  def test_batch_normalization_invalid_axis(self):
+    """Normalizing over the batch or time dimension is not allowed."""
+    x = self.random_sequence(2, 3, 5)
+    l = self.sl.BatchNormalization.Config(axis=0).make()
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+    l = self.sl.BatchNormalization.Config(axis=1).make()
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+    l = self.sl.BatchNormalization.Config(axis=2).make()
+    l = self.init_layer(l, x)
+
+
+class GroupNormalizationTest(test_utils.SequenceLayerTest):
+  """Test behavior of GroupNormalization layer."""
+
+  def test_invalid_axis(self):
+    """Normalizing over the batch or time dimension is not allowed."""
+    x = self.random_sequence(2, 3, 5)
+    l = self.sl.GroupNormalization.Config(num_groups=1, axis=0).make()
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+    l = self.sl.GroupNormalization.Config(num_groups=1, axis=1).make()
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+    l = self.sl.GroupNormalization.Config(num_groups=1, axis=2).make()
+    l = self.init_layer(l, x)
+
+  def test_invalid_groups(self):
+    x = self.random_sequence(2, 3, 5)
+    l = self.sl.GroupNormalization.Config(num_groups=2).make()
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+  @parameterized.parameters(
+      itertools.product(
+          [
+              ((8, 6, 6), -1, 3, [6]),
+              ((8, 6, 5, 6), -2, 5, [5]),
+              ((8, 6, 5, 6), -2, 1, [5]),
+          ],
+          (False, True),
+      )
+  )
+  def test_group_normalization(self, shape_axes, cumulative):
+    shape, axis, num_groups, _ = shape_axes
+    l = self.sl.GroupNormalization.Config(
+        num_groups=num_groups,
+        cumulative=cumulative,
+        axis=axis,
+        name='group_normalization',
+    ).make()
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.name, 'group_normalization')
+
+    x = self.random_sequence(*shape)
+    l = self.init_layer(l, x)
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
+
+    # Test inference (training=False) - both backends match.
+    y_test = self.verify_contract(l, x, training=False)
+
+    axis = axis + x.ndim if axis < 0 else axis
+    shape = list(y_test.values.shape)
+    axis_dim = shape[axis]
+    group_size = axis_dim // num_groups
+    outer_dims = shape[:axis]
+    inner_dims = shape[axis + 1 :]
+
+    # Unscale and verify group normalization per-timestep.
+    if cumulative:
+      # Skip testing cumulative mode numerically.
+      return
+
+    y_vals = y_test.values
+    y_grouped = np.reshape(
+        y_vals,
+        outer_dims + [num_groups, group_size] + inner_dims,
+    )
+
+    if l.supports_step:
+      # Pointwise/causal normalization: reduce only over group_size (axis 3).
+      reduction_dims = (3,)
+    else:
+      # Non-causal normalization: reduce over time (axis 1) and group_size (axis 3).
+      reduction_dims = tuple(
+          a for a in range(y_grouped.ndim) if a not in (0, axis)
+      )
+    expanded_mask = self.sl.types.Sequence(y_grouped, x.mask).expanded_mask()
+    expanded_mask_np = np.array(expanded_mask, dtype=bool)
+    y_grouped_np = np.array(y_grouped)
+
+    mean = np.mean(
+        y_grouped_np, axis=reduction_dims, keepdims=True, where=expanded_mask_np
+    )
+    var = np.var(
+        y_grouped_np, axis=reduction_dims, keepdims=True, where=expanded_mask_np
+    )
+
+    # Avoid NaNs.
+    mean = np.where(np.isnan(mean), np.zeros_like(mean), mean)
+    var = np.where(np.isnan(var), np.ones_like(var), var)
+
+    np.testing.assert_allclose(mean, np.zeros_like(mean), atol=2e-5)
+    if l.supports_step:
+      if group_size == 1:
+        # Reducing over 1 element per-timestep mathematically results in 0 variance.
+        np.testing.assert_allclose(var, np.zeros_like(var), atol=1e-3)
+      elif group_size == 2:
+        # For tiny group sizes per-timestep, the output variance can naturally
+        # deviate from 1.0 due to epsilon.
+        np.testing.assert_allclose(var, np.ones_like(var), atol=0.8)
+      else:
+        np.testing.assert_allclose(var, np.ones_like(var), atol=1e-3)
+    else:
+      np.testing.assert_allclose(var, np.ones_like(var), atol=1e-3)
diff --git a/sequence_layers/specs/pooling.py b/sequence_layers/specs/pooling.py
new file mode 100644
index 0000000..d36fea2
--- /dev/null
+++ b/sequence_layers/specs/pooling.py
@@ -0,0 +1,281 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specifications for pooling layers.
+
+See the corresponding _behaviors module for behaviors.
+"""
+
+import abc
+import dataclasses
+from typing import Any, override, Sequence
+
+from sequence_layers.specs import types as types_spec
+
+
+class BasePooling[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Base specification for pooling layers."""
+
+
+class MinPooling1D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BasePooling[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for MinPooling1D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for MinPooling1D."""
+
+    pool_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: types_spec.PaddingModeString = types_spec.PaddingMode.VALID.value
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class MaxPooling1D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BasePooling[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for MaxPooling1D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for MaxPooling1D."""
+
+    pool_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: types_spec.PaddingModeString = types_spec.PaddingMode.VALID.value
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class AveragePooling1D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BasePooling[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for AveragePooling1D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for AveragePooling1D."""
+
+    pool_size: int
+    strides: int = 1
+    dilation_rate: int = 1
+    padding: types_spec.PaddingModeString = types_spec.PaddingMode.VALID.value
+    masked_average: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class MinPooling2D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BasePooling[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for MinPooling2D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for MinPooling2D."""
+
+    pool_size: int | Sequence[int]
+    strides: int | Sequence[int] = 1
+    dilation_rate: int | Sequence[int] = 1
+    time_padding: types_spec.PaddingModeString = (
+        types_spec.PaddingMode.VALID.value
+    )
+    spatial_padding: types_spec.PaddingModeString | tuple[int, int] = (
+        types_spec.PaddingMode.SAME.value
+    )
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class MaxPooling2D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BasePooling[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for MaxPooling2D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for MaxPooling2D."""
+
+    pool_size: int | Sequence[int]
+    strides: int | Sequence[int] = 1
+    dilation_rate: int | Sequence[int] = 1
+    time_padding: types_spec.PaddingModeString = (
+        types_spec.PaddingMode.VALID.value
+    )
+    spatial_padding: types_spec.PaddingModeString | tuple[int, int] = (
+        types_spec.PaddingMode.SAME.value
+    )
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class AveragePooling2D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BasePooling[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for AveragePooling2D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for AveragePooling2D."""
+
+    pool_size: int | Sequence[int]
+    strides: int | Sequence[int] = 1
+    dilation_rate: int | Sequence[int] = 1
+    time_padding: types_spec.PaddingModeString = (
+        types_spec.PaddingMode.VALID.value
+    )
+    spatial_padding: types_spec.PaddingModeString | tuple[int, int] = (
+        types_spec.PaddingMode.SAME.value
+    )
+    masked_average: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class MinPooling3D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BasePooling[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for MinPooling3D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for MinPooling3D."""
+
+    pool_size: int | Sequence[int]
+    strides: int | Sequence[int] = 1
+    dilation_rate: int | Sequence[int] = 1
+    time_padding: types_spec.PaddingModeString = (
+        types_spec.PaddingMode.VALID.value
+    )
+    spatial_padding: Sequence[
+        types_spec.PaddingModeString | tuple[int, int]
+    ] = (
+        types_spec.PaddingMode.SAME.value,
+        types_spec.PaddingMode.SAME.value,
+    )
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class MaxPooling3D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BasePooling[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for MaxPooling3D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for MaxPooling3D."""
+
+    pool_size: int | Sequence[int]
+    strides: int | Sequence[int] = 1
+    dilation_rate: int | Sequence[int] = 1
+    time_padding: types_spec.PaddingModeString = (
+        types_spec.PaddingMode.VALID.value
+    )
+    spatial_padding: Sequence[
+        types_spec.PaddingModeString | tuple[int, int]
+    ] = (
+        types_spec.PaddingMode.SAME.value,
+        types_spec.PaddingMode.SAME.value,
+    )
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class AveragePooling3D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    BasePooling[SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for AveragePooling3D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for AveragePooling3D."""
+
+    pool_size: int | Sequence[int]
+    strides: int | Sequence[int] = 1
+    dilation_rate: int | Sequence[int] = 1
+    time_padding: types_spec.PaddingModeString = (
+        types_spec.PaddingMode.VALID.value
+    )
+    spatial_padding: Sequence[
+        types_spec.PaddingModeString | tuple[int, int]
+    ] = (
+        types_spec.PaddingMode.SAME.value,
+        types_spec.PaddingMode.SAME.value,
+    )
+    masked_average: bool = False
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
diff --git a/sequence_layers/specs/pooling_behaviors.py b/sequence_layers/specs/pooling_behaviors.py
new file mode 100644
index 0000000..80ffbda
--- /dev/null
+++ b/sequence_layers/specs/pooling_behaviors.py
@@ -0,0 +1,859 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Behavior tests for pooling layers.
+
+Backend-specific test files should inherit from these tests.
+"""
+
+# pylint: disable=abstract-method
+# pyrefly: disable=bad-instantiation
+from absl.testing import parameterized
+import numpy as np
+
+from sequence_layers.specs import test_utils
+
+
+class Pooling1DTest(test_utils.SequenceLayerTest):
+  """Test behavior of 1D pooling layers."""
+
+  def test_defaults(self):
+    self.assertConfigDefaults(
+        self.sl.MaxPooling1D.Config,
+        {
+            'strides': 1,
+            'dilation_rate': 1,
+            'padding': 'valid',
+            'name': None,
+        },
+        pool_size=3,
+    )
+    self.assertConfigDefaults(
+        self.sl.MinPooling1D.Config,
+        {
+            'strides': 1,
+            'dilation_rate': 1,
+            'padding': 'valid',
+            'name': None,
+        },
+        pool_size=3,
+    )
+    self.assertConfigDefaults(
+        self.sl.AveragePooling1D.Config,
+        {
+            'strides': 1,
+            'dilation_rate': 1,
+            'padding': 'valid',
+            'masked_average': False,
+            'name': None,
+        },
+        pool_size=3,
+    )
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      params=[
+          # 1x1 conv.
+          (1, 1, 1),
+          # even pool_size with smaller, equal and larger strides.
+          (2, 1, 1),
+          (2, 2, 1),
+          (2, 3, 1),
+          # odd pool_size with smaller, equal and larger strides.
+          (3, 2, 1),
+          (3, 3, 1),
+          (3, 4, 1),
+          # pool_size smaller, equal and larger than even dilation_rate.
+          (1, 1, 2),
+          (2, 1, 2),
+          (3, 1, 2),
+          # pool_size smaller, equal and larger than odd dilation_rate.
+          (1, 1, 3),
+          (2, 1, 3),
+          (3, 1, 3),
+      ],
+      padding=[
+          'same',
+          'valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+  )
+  def test_pooling1d(self, pool_type_kwargs, params, padding):
+    pool_type, kwargs = pool_type_kwargs
+    return self._test_pooling1d(
+        pool_type,
+        params,
+        (3,),
+        padding,
+        self.xp.float32,
+        **kwargs,
+    )
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      dtype_name=[
+          'FLOAT32',
+          'INT32',
+      ],
+  )
+  def test_dtypes(self, pool_type_kwargs, dtype_name):
+    pool_type, kwargs = pool_type_kwargs
+    dtype = getattr(self.xp, dtype_name.lower())
+    return self._test_pooling1d(
+        pool_type, (3, 2, 1), (3,), 'reverse_causal', dtype, **kwargs
+    )
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      channel_shape=(
+          (),
+          (3,),
+          (3, 5),
+      ),
+  )
+  def test_channel_shapes(self, pool_type_kwargs, channel_shape):
+    pool_type, kwargs = pool_type_kwargs
+    return self._test_pooling1d(
+        pool_type,
+        (3, 2, 1),
+        channel_shape,
+        'reverse_causal',
+        self.xp.float32,
+        **kwargs,
+    )
+
+  @parameterized.product(
+      masked_average=[True, False],
+  )
+  def test_masked_average(self, masked_average):
+    pool_size, stride, dilation_rate = 3, 3, 1
+    padding = 'reverse_causal'
+    config = self.sl.AveragePooling1D.Config(
+        pool_size=pool_size,
+        strides=stride,
+        dilation_rate=dilation_rate,
+        padding=padding,
+        name='pool_1d',
+        masked_average=masked_average,
+    )
+    l = self.make_layer(config)
+
+    x_values = np.array(
+        [
+            [1, 2, 3, 4, 5, 6],
+            [3, 4, 5, 6, 7, 8],
+            [5, 6, 7, 8, 9, 0],
+            [2, 3, 0, 6, 2, 1],
+            [0, 6, 2, 1, 7, 8],
+        ],
+        dtype=np.float32,
+    )
+
+    x_mask = np.array(
+        [
+            [False, False, False, False, False, False],
+            [True, True, True, False, False, False],
+            [True, True, True, True, False, False],
+            [True, True, True, True, True, False],
+            [True, True, True, True, True, True],
+        ],
+        dtype=bool,
+    )
+
+    x = self.sl.types.Sequence(
+        self.xp.array(x_values),
+        self.xp.array(x_mask),
+    )
+    l = self.init_layer(l, x)
+    y = l.layer(x, training=False)
+
+    if masked_average:
+      expected_y_values = np.array(
+          [
+              [0.0, 0.0],
+              [(3 + 4 + 5) / 3.0, 0],
+              [(5 + 6 + 7) / 3.0, 8],
+              [(2 + 3 + 0) / 3.0, (6 + 2) / 2.0],
+              [(0 + 6 + 2) / 3.0, (1 + 7 + 8) / 3.0],
+          ],
+          dtype=np.float32,
+      )
+    else:
+      expected_y_values = np.array(
+          [
+              [0.0, 0.0],
+              [(3 + 4 + 5) / 3.0, 0],
+              [(5 + 6 + 7) / 3.0, 8 / 3.0],
+              [(2 + 3 + 0) / 3.0, (6 + 2) / 3.0],
+              [(0 + 6 + 2) / 3.0, (1 + 7 + 8) / 3.0],
+          ],
+          dtype=np.float32,
+      )
+
+    expected_y_mask = np.array(
+        [
+            [False, False],
+            [True, False],
+            [True, True],
+            [True, True],
+            [True, True],
+        ],
+        dtype=bool,
+    )
+
+    expected_y = self.sl.types.Sequence(
+        self.xp.array(expected_y_values),
+        self.xp.array(expected_y_mask),
+    )
+    self.assertSequencesClose(y, expected_y)
+
+  def _test_pooling1d(
+      self, pool_type, params, channel_shape, padding, dtype, **kwargs
+  ):
+    pool_size, stride, dilation_rate = params
+    effective_pool_size = (pool_size - 1) * dilation_rate + 1
+
+    match pool_type:
+      case 'min':
+        config = self.sl.MinPooling1D.Config(
+            pool_size=pool_size,
+            strides=stride,
+            dilation_rate=dilation_rate,
+            padding=padding,
+            name='pool_1d',
+            **kwargs,
+        )
+      case 'max':
+        config = self.sl.MaxPooling1D.Config(
+            pool_size=pool_size,
+            strides=stride,
+            dilation_rate=dilation_rate,
+            padding=padding,
+            name='pool_1d',
+            **kwargs,
+        )
+      case 'average':
+        config = self.sl.AveragePooling1D.Config(
+            pool_size=pool_size,
+            strides=stride,
+            dilation_rate=dilation_rate,
+            padding=padding,
+            name='pool_1d',
+            **kwargs,
+        )
+      case _:
+        raise NotImplementedError()
+
+    l = self.make_layer(config)
+
+    self.assertEqual(l.block_size, stride)
+    self.assertEqual(1 / l.output_ratio, stride)
+    self.assertEqual(l.name, 'pool_1d')
+    self.assertEqual(
+        l.supports_step,
+        padding
+        in (
+            'reverse_causal_valid',
+            'causal',
+            'reverse_causal',
+            'semicausal',
+        ),
+    )
+
+    expected_input_latency = (
+        effective_pool_size - 1
+        if padding in ('reverse_causal_valid', 'reverse_causal')
+        else 0
+    )
+    self.assertEqual(l.input_latency, expected_input_latency)
+    self.assertEqual(l.output_latency, expected_input_latency // stride)
+
+    batch_size = 2
+    x = self.random_sequence(batch_size, 1, *channel_shape, dtype=dtype)
+    l = self.init_layer(l, x)
+    self.assertEmpty(self.get_variables(l))
+
+    output_spec = l.get_output_spec(x.channel_spec)
+    self.assertEqual(output_spec.dtype, dtype)
+    self.assertEqual(output_spec.shape, channel_shape)
+
+    # Check contract compatibility on various sequence lengths.
+    # JAX does not support reduce_window gradients with dilation_rate > 1.
+    test_gradients = dilation_rate == 1 and self.xp.float32 == dtype
+    test_receptive_field = dilation_rate == 1 and self.xp.float32 == dtype
+
+    for time in range(20 * l.block_size - 1, 20 * l.block_size + 2):
+      x = self.random_sequence(batch_size, time, *channel_shape, dtype=dtype)
+      self.verify_contract(
+          l,
+          x,
+          training=False,
+          test_gradients=test_gradients,
+          test_receptive_field=test_receptive_field,
+      )
+
+
+class Pooling2DTest(test_utils.SequenceLayerTest):
+  """Test behavior of 2D pooling layers."""
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      params=[
+          # 1x1 conv.
+          (1, 1, 1),
+          # even pool_size with smaller, equal and larger strides.
+          (2, 1, 1),
+          (2, 2, 1),
+          (2, 3, 1),
+          # odd pool_size with smaller, equal and larger strides.
+          (3, 2, 1),
+          (3, 3, 1),
+          (3, 4, 1),
+          # pool_size smaller, equal and larger than even dilation_rate.
+          (1, 1, 2),
+          (2, 1, 2),
+          (3, 1, 2),
+          # pool_size smaller, equal and larger than odd dilation_rate.
+          (1, 1, 3),
+          (2, 1, 3),
+          (3, 1, 3),
+      ],
+      time_padding=[
+          'same',
+          'valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+  )
+  def test_pooling2d(self, pool_type_kwargs, params, time_padding):
+    pool_type, kwargs = pool_type_kwargs
+    self._test_pooling2d(
+        pool_type,
+        params,
+        (9,),
+        time_padding,
+        'same',
+        self.xp.float32,
+        **kwargs,
+    )
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      spatial_padding=[
+          'same',
+          'valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+  )
+  def test_spatial_padding(self, pool_type_kwargs, spatial_padding):
+    pool_type, kwargs = pool_type_kwargs
+    return self._test_pooling2d(
+        pool_type,
+        (3, 2, 1),
+        (9,),
+        'reverse_causal',
+        spatial_padding,
+        self.xp.float32,
+        **kwargs,
+    )
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      dtype_name=[
+          'FLOAT32',
+          'INT32',
+      ],
+  )
+  def test_dtypes(self, pool_type_kwargs, dtype_name):
+    pool_type, kwargs = pool_type_kwargs
+    dtype = getattr(self.xp, dtype_name.lower())
+    return self._test_pooling2d(
+        pool_type,
+        (3, 2, 1),
+        (9,),
+        'reverse_causal',
+        'reverse_causal',
+        dtype,
+        **kwargs,
+    )
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      channel_shape=(
+          (9,),
+          (9, 5),
+          (9, 5, 3),
+      ),
+  )
+  def test_channel_shapes(self, pool_type_kwargs, channel_shape):
+    pool_type, kwargs = pool_type_kwargs
+    return self._test_pooling2d(
+        pool_type,
+        (3, 2, 1),
+        channel_shape,
+        'reverse_causal',
+        'reverse_causal',
+        self.xp.float32,
+        **kwargs,
+    )
+
+  @parameterized.product(
+      masked_average=[True, False],
+  )
+  def test_masked_average(self, masked_average):
+    pool_size, stride, dilation_rate = (3, 2), (3, 2), (1, 1)
+    time_padding = 'reverse_causal'
+    spatial_padding = 'reverse_causal'
+    config = self.sl.AveragePooling2D.Config(
+        pool_size=pool_size,
+        strides=stride,
+        dilation_rate=dilation_rate,
+        time_padding=time_padding,
+        spatial_padding=spatial_padding,
+        name='pool_2d',
+        masked_average=masked_average,
+    )
+    l = self.make_layer(config)
+
+    x_values = np.array(
+        [
+            [[1, 2], [2, 3], [5, 6], [7, 8], [9, 3], [4, 2]],
+            [[2, 3], [5, 6], [7, 8], [9, 3], [3, 1], [2, 7]],
+            [[5, 2], [7, 3], [0, 3], [3, 1], [2, 6], [1, 2]],
+            [[7, 3], [0, 3], [3, 1], [2, 6], [1, 2], [3, 4]],
+            [[0, 3], [3, 1], [2, 6], [1, 2], [3, 4], [5, 7]],
+        ],
+        dtype=np.float32,
+    )
+
+    x_mask = np.array(
+        [
+            [False, False, False, False, False, False],
+            [True, True, True, False, False, False],
+            [True, True, True, True, False, False],
+            [True, True, True, True, True, False],
+            [True, True, True, True, True, True],
+        ],
+        dtype=bool,
+    )
+
+    x = self.sl.types.Sequence(
+        self.xp.array(x_values),
+        self.xp.array(x_mask),
+    )
+    l = self.init_layer(l, x)
+    y = l.layer(x, training=False)
+
+    if masked_average:
+      expected_y_values = np.array(
+          [
+              [[0.0], [0.0]],
+              [[(2 + 5 + 7 + 3 + 6 + 8) / 6.0], [0]],
+              [[(5 + 7 + 0 + 2 + 3 + 3) / 6.0], [(3 + 1) / 2.0]],
+              [[(7 + 0 + 3 + 3 + 3 + 1) / 6.0], [(2 + 1 + 6 + 2) / 4.0]],
+              [
+                  [(0 + 3 + 2 + 3 + 1 + 6) / 6.0],
+                  [(1 + 3 + 5 + 2 + 4 + 7) / 6.0],
+              ],
+          ],
+          dtype=np.float32,
+      )
+    else:
+      expected_y_values = np.array(
+          [
+              [[0.0], [0.0]],
+              [[(2 + 5 + 7 + 3 + 6 + 8) / 6.0], [0]],
+              [[(5 + 7 + 0 + 2 + 3 + 3) / 6.0], [(3 + 1) / 6.0]],
+              [[(7 + 0 + 3 + 3 + 3 + 1) / 6.0], [(2 + 1 + 6 + 2) / 6.0]],
+              [
+                  [(0 + 3 + 2 + 3 + 1 + 6) / 6.0],
+                  [(1 + 3 + 5 + 2 + 4 + 7) / 6.0],
+              ],
+          ],
+          dtype=np.float32,
+      )
+
+    expected_y_mask = np.array(
+        [
+            [False, False],
+            [True, False],
+            [True, True],
+            [True, True],
+            [True, True],
+        ],
+        dtype=bool,
+    )
+
+    expected_y = self.sl.types.Sequence(
+        self.xp.array(expected_y_values),
+        self.xp.array(expected_y_mask),
+    )
+    self.assertSequencesClose(y, expected_y)
+
+  def _test_pooling2d(
+      self,
+      pool_type,
+      params,
+      channel_shape,
+      time_padding,
+      spatial_padding,
+      dtype,
+      **kwargs,
+  ):
+    pool_size, stride, dilation_rate = params
+    effective_pool_size = (pool_size - 1) * dilation_rate + 1
+
+    match pool_type:
+      case 'min':
+        config = self.sl.MinPooling2D.Config(
+            pool_size=pool_size,
+            strides=stride,
+            dilation_rate=dilation_rate,
+            time_padding=time_padding,
+            spatial_padding=spatial_padding,
+            name='pool_2d',
+            **kwargs,
+        )
+      case 'max':
+        config = self.sl.MaxPooling2D.Config(
+            pool_size=pool_size,
+            strides=stride,
+            dilation_rate=dilation_rate,
+            time_padding=time_padding,
+            spatial_padding=spatial_padding,
+            name='pool_2d',
+            **kwargs,
+        )
+      case 'average':
+        config = self.sl.AveragePooling2D.Config(
+            pool_size=pool_size,
+            strides=stride,
+            dilation_rate=dilation_rate,
+            time_padding=time_padding,
+            spatial_padding=spatial_padding,
+            name='pool_2d',
+            **kwargs,
+        )
+      case _:
+        raise NotImplementedError()
+
+    l = self.make_layer(config)
+
+    self.assertEqual(l.block_size, stride)
+    self.assertEqual(1 / l.output_ratio, stride)
+    self.assertEqual(l.name, 'pool_2d')
+    self.assertEqual(
+        l.supports_step,
+        time_padding
+        in (
+            'reverse_causal_valid',
+            'causal',
+            'reverse_causal',
+            'semicausal',
+        ),
+    )
+
+    expected_input_latency = (
+        effective_pool_size - 1
+        if time_padding in ('reverse_causal_valid', 'reverse_causal')
+        else 0
+    )
+    self.assertEqual(l.input_latency, expected_input_latency)
+    self.assertEqual(l.output_latency, expected_input_latency // stride)
+
+    batch_size = 2
+    x = self.random_sequence(batch_size, 1, *channel_shape, dtype=dtype)
+    l = self.init_layer(l, x)
+    self.assertEmpty(self.get_variables(l))
+
+    output_spec = l.get_output_spec(x.channel_spec)
+    self.assertEqual(output_spec.dtype, dtype)
+
+    # Verify verification contract
+    test_gradients = dilation_rate == 1 and self.xp.float32 == dtype
+    test_receptive_field = dilation_rate == 1 and self.xp.float32 == dtype
+
+    for time in range(20 * l.block_size - 1, 20 * l.block_size + 2):
+      x = self.random_sequence(batch_size, time, *channel_shape, dtype=dtype)
+      self.verify_contract(
+          l,
+          x,
+          training=False,
+          test_gradients=test_gradients,
+          test_receptive_field=test_receptive_field,
+      )
+
+
+class Pooling3DTest(test_utils.SequenceLayerTest):
+  """Test behavior of 3D pooling layers."""
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      params=[
+          # 1x1 conv.
+          (1, 1, 1),
+          # even pool_size with smaller, equal and larger strides.
+          (2, 1, 1),
+          (2, 2, 1),
+          (2, 3, 1),
+          # odd pool_size with smaller, equal and larger strides.
+          (3, 2, 1),
+          (3, 3, 1),
+          (3, 4, 1),
+          # pool_size smaller, equal and larger than even dilation_rate.
+          (1, 1, 2),
+          (2, 1, 2),
+          (3, 1, 2),
+          # pool_size smaller, equal and larger than odd dilation_rate.
+          (1, 1, 3),
+          (2, 1, 3),
+          (3, 1, 3),
+      ],
+      time_padding=[
+          'same',
+          'valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+  )
+  def test_pooling3d(self, pool_type_kwargs, params, time_padding):
+    pool_type, kwargs = pool_type_kwargs
+    self._test_pooling3d(
+        pool_type,
+        params,
+        (9, 9),
+        time_padding,
+        'same',
+        self.xp.float32,
+        **kwargs,
+    )
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      spatial_padding=[
+          'same',
+          'valid',
+          'reverse_causal_valid',
+          'causal',
+          'reverse_causal',
+          'semicausal',
+      ],
+  )
+  def test_spatial_padding(self, pool_type_kwargs, spatial_padding):
+    pool_type, kwargs = pool_type_kwargs
+    return self._test_pooling3d(
+        pool_type,
+        (3, 2, 1),
+        (9, 9),
+        'reverse_causal',
+        spatial_padding,
+        self.xp.float32,
+        **kwargs,
+    )
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      dtype_name=[
+          'FLOAT32',
+          'INT32',
+      ],
+  )
+  def test_dtypes(self, pool_type_kwargs, dtype_name):
+    pool_type, kwargs = pool_type_kwargs
+    dtype = getattr(self.xp, dtype_name.lower())
+    return self._test_pooling3d(
+        pool_type,
+        (3, 2, 1),
+        (9, 9),
+        'reverse_causal',
+        'reverse_causal',
+        dtype,
+        **kwargs,
+    )
+
+  @parameterized.product(
+      pool_type_kwargs=(
+          ('min', {}),
+          ('max', {}),
+          ('average', {'masked_average': False}),
+          ('average', {'masked_average': True}),
+      ),
+      channel_shape=(
+          (9, 9),
+          (9, 9, 5),
+      ),
+  )
+  def test_channel_shapes(self, pool_type_kwargs, channel_shape):
+    pool_type, kwargs = pool_type_kwargs
+    return self._test_pooling3d(
+        pool_type,
+        (3, 2, 1),
+        channel_shape,
+        'reverse_causal',
+        'reverse_causal',
+        self.xp.float32,
+        **kwargs,
+    )
+
+  def _test_pooling3d(
+      self,
+      pool_type,
+      params,
+      channel_shape,
+      time_padding,
+      spatial_padding,
+      dtype,
+      **kwargs,
+  ):
+    pool_size, stride, dilation_rate = params
+    effective_pool_size = (pool_size - 1) * dilation_rate + 1
+
+    match pool_type:
+      case 'min':
+        config = self.sl.MinPooling3D.Config(
+            pool_size=pool_size,
+            strides=stride,
+            dilation_rate=dilation_rate,
+            time_padding=time_padding,
+            spatial_padding=(spatial_padding, spatial_padding),
+            name='pool_3d',
+            **kwargs,
+        )
+      case 'max':
+        config = self.sl.MaxPooling3D.Config(
+            pool_size=pool_size,
+            strides=stride,
+            dilation_rate=dilation_rate,
+            time_padding=time_padding,
+            spatial_padding=(spatial_padding, spatial_padding),
+            name='pool_3d',
+            **kwargs,
+        )
+      case 'average':
+        config = self.sl.AveragePooling3D.Config(
+            pool_size=pool_size,
+            strides=stride,
+            dilation_rate=dilation_rate,
+            time_padding=time_padding,
+            spatial_padding=(spatial_padding, spatial_padding),
+            name='pool_3d',
+            **kwargs,
+        )
+      case _:
+        raise NotImplementedError()
+
+    l = self.make_layer(config)
+
+    self.assertEqual(l.block_size, stride)
+    self.assertEqual(1 / l.output_ratio, stride)
+    self.assertEqual(l.name, 'pool_3d')
+    self.assertEqual(
+        l.supports_step,
+        time_padding
+        in (
+            'reverse_causal_valid',
+            'causal',
+            'reverse_causal',
+            'semicausal',
+        ),
+    )
+
+    expected_input_latency = (
+        effective_pool_size - 1
+        if time_padding in ('reverse_causal_valid', 'reverse_causal')
+        else 0
+    )
+    self.assertEqual(l.input_latency, expected_input_latency)
+    self.assertEqual(l.output_latency, expected_input_latency // stride)
+
+    batch_size = 2
+    x = self.random_sequence(batch_size, 1, *channel_shape, dtype=dtype)
+    l = self.init_layer(l, x)
+    self.assertEmpty(self.get_variables(l))
+
+    output_spec = l.get_output_spec(x.channel_spec)
+    self.assertEqual(output_spec.dtype, dtype)
+
+    # Verify verification contract
+    test_gradients = dilation_rate == 1 and self.xp.float32 == dtype
+    test_receptive_field = dilation_rate == 1 and self.xp.float32 == dtype
+
+    for time in range(20 * l.block_size - 1, 20 * l.block_size + 2):
+      x = self.random_sequence(batch_size, time, *channel_shape, dtype=dtype)
+      self.verify_contract(
+          l,
+          x,
+          training=False,
+          test_gradients=test_gradients,
+          test_receptive_field=test_receptive_field,
+      )
diff --git a/sequence_layers/specs/position.py b/sequence_layers/specs/position.py
new file mode 100644
index 0000000..9308343
--- /dev/null
+++ b/sequence_layers/specs/position.py
@@ -0,0 +1,85 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specifications for position and timing layers."""
+
+import abc
+import dataclasses
+from typing import Any, override, Protocol, runtime_checkable
+
+from sequence_layers.specs import types as types_spec
+
+
+class AddTimingSignal[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Adds sinusoids at varying frequencies to the input channels dimension."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Config for AddTimingSignal."""
+
+    min_timescale: float = 1.0
+    max_timescale: float = 1.0e4
+    trainable_scale: bool = False
+    axes: int | tuple[int, ...] | None = None
+    sharding: types_spec.Sharding | None = None
+    param_dtype: Any = None
+    only_advance_position_for_valid_timesteps: bool = True
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+class ApplyRotaryPositionalEncoding[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Applies Rotary Positional Encodings (RoPE) to the sequence."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Config for ApplyRotaryPositionalEncoding."""
+
+    max_wavelength: float
+    axis: int = -1
+    only_advance_position_for_valid_timesteps: bool = True
+    positions_in_at_least_fp32: bool = True
+    positions_name: str | None = None
+    name: str | None = None
+
+    @override
+    def make(self) -> Any:
+      """Dummy make to satisfy Pyrefly."""
+
+
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Protocol for position module."""
+
+  @property
+  def AddTimingSignal(self) -> type[AddTimingSignal]:
+    ...
+
+  @property
+  def ApplyRotaryPositionalEncoding(
+      self,
+  ) -> type[ApplyRotaryPositionalEncoding]:
+    ...
diff --git a/sequence_layers/specs/position_behaviors.py b/sequence_layers/specs/position_behaviors.py
new file mode 100644
index 0000000..f5656f3
--- /dev/null
+++ b/sequence_layers/specs/position_behaviors.py
@@ -0,0 +1,290 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Shared behavior tests for position and timing layers."""
+
+# pylint: disable=abstract-method
+# pyrefly: disable=bad-instantiation
+
+from absl.testing import parameterized
+import numpy as np
+
+from sequence_layers.specs import position as position_spec
+from sequence_layers.specs import test_utils
+
+
+class AddTimingSignalTest(test_utils.SequenceLayerTest):
+  """Test behavior of AddTimingSignal layer."""
+
+  @parameterized.parameters(
+      dict(
+          min_timescale=1.0,
+          max_timescale=1.0e4,
+          trainable_scale=True,
+          channel_shape=(3,),
+          axes=None,
+      ),
+      dict(
+          min_timescale=1.0,
+          max_timescale=1.0e4,
+          trainable_scale=False,
+          channel_shape=(3,),
+          axes=None,
+      ),
+      dict(
+          min_timescale=10.0,
+          max_timescale=1.0e5,
+          trainable_scale=False,
+          channel_shape=(3,),
+          axes=0,
+      ),
+      dict(
+          min_timescale=1.0,
+          max_timescale=1.0e4,
+          trainable_scale=True,
+          channel_shape=(5, 9),
+          axes=(1,),
+      ),
+      dict(
+          min_timescale=1.0,
+          max_timescale=1.0e4,
+          trainable_scale=True,
+          channel_shape=(5, 9, 3),
+          axes=[1, 2],
+      ),
+      dict(
+          min_timescale=1.0,
+          max_timescale=1.0e4,
+          trainable_scale=True,
+          channel_shape=(5, 9),
+          axes=(1,),
+          only_advance_position_for_valid_timesteps=False,
+      ),
+  )
+  def test_basic(
+      self,
+      min_timescale,
+      max_timescale,
+      trainable_scale,
+      channel_shape,
+      axes,
+      only_advance_position_for_valid_timesteps=True,
+  ):
+    config = self.sl.AddTimingSignal.Config(
+        min_timescale=min_timescale,
+        max_timescale=max_timescale,
+        trainable_scale=trainable_scale,
+        axes=axes,
+        only_advance_position_for_valid_timesteps=only_advance_position_for_valid_timesteps,
+        name='add_timing_signal',
+    )
+    layer = self.make_layer(config)
+    batch_size = 8
+    x = self.random_sequence(batch_size, 1, *channel_shape)
+    layer = self.init_layer(layer, x)
+
+    self.assertEqual(layer.block_size, 1)
+    self.assertEqual(layer.output_ratio, 1)
+    self.assertEqual(layer.name, 'add_timing_signal')
+    self.assertEqual(layer.get_output_shape(x.channel_shape), x.channel_shape)
+
+    # Verify trainable scale presence in variables
+    variables = self.get_variables(layer)
+    if isinstance(variables, dict) and 'params' in variables:
+      params = variables['params']
+    else:
+      params = variables
+    if trainable_scale:
+      self.assertIn('scale', params)
+    else:
+      self.assertNotIn('scale', params)
+
+    for time in range(13 * layer.block_size, 15 * layer.block_size):
+      x = self.random_sequence(
+          batch_size,
+          time,
+          *channel_shape,
+          random_mask=True,
+      )
+      self.verify_contract(layer, x, training=False)
+
+  @parameterized.parameters(
+      dict(channel_shape=(2, 3), axes=-1, normalized_axes=(1,)),
+      dict(channel_shape=(2, 3, 5), axes=[0, 2], normalized_axes=(0, 2)),
+  )
+  def test_timing_signal_along_axes(self, channel_shape, axes, normalized_axes):
+    config = self.sl.AddTimingSignal.Config(
+        axes=axes,
+        name='add_timing_signal',
+    )
+    layer = self.make_layer(config)
+    batch_size = 2
+    seq_len = 3
+    inputs = self.sl.Sequence.from_values(
+        self.xp.zeros((batch_size, seq_len, *channel_shape))
+    )
+    layer = self.init_layer(layer, inputs)
+    outputs = layer.layer(inputs, training=False)
+    outputs_np = np.asarray(outputs.values[0, -1])
+
+    channel_dims = len(channel_shape)
+
+    with self.subTest('equal_along_broadcasted_axes'):
+      broadcast_slice_0 = tuple(
+          slice(None) if axis in normalized_axes else 0
+          for axis in range(channel_dims)
+      )
+      broadcast_slice_1 = tuple(
+          slice(None) if axis in normalized_axes else 1
+          for axis in range(channel_dims)
+      )
+      self.assertAllEqual(
+          outputs_np[broadcast_slice_0], outputs_np[broadcast_slice_1]
+      )
+
+    with self.subTest('not_equal_over_all_axes'):
+      complementary_slice_0 = tuple(
+          0 if axis in normalized_axes else slice(None)
+          for axis in range(channel_dims)
+      )
+      complementary_slice_1 = tuple(
+          1 if axis in normalized_axes else slice(None)
+          for axis in range(channel_dims)
+      )
+      self.assertNotAllEqual(
+          outputs_np[complementary_slice_0], outputs_np[complementary_slice_1]
+      )
+
+
+class ApplyRotaryPositionalEncodingTest(test_utils.SequenceLayerTest):
+  """Test behavior of ApplyRotaryPositionalEncoding layer."""
+
+  @parameterized.product(
+      max_wavelength=(1.0e4, 1.0e5),
+      channel_shape=((4,), (3, 6)),
+      only_advance_position_for_valid_timesteps=(False, True),
+  )
+  def test_basic(
+      self,
+      max_wavelength,
+      channel_shape,
+      only_advance_position_for_valid_timesteps,
+  ):
+    config = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=max_wavelength,
+        only_advance_position_for_valid_timesteps=only_advance_position_for_valid_timesteps,
+        name='rope',
+    )
+    layer = self.make_layer(config)
+    batch_size = 2
+    x = self.random_sequence(batch_size, 1, *channel_shape)
+    layer = self.init_layer(layer, x)
+
+    self.assertEqual(layer.block_size, 1)
+    self.assertEqual(layer.output_ratio, 1)
+    self.assertEqual(layer.name, 'rope')
+    self.assertEqual(layer.get_output_shape(x.channel_shape), x.channel_shape)
+
+    for time in range(13 * layer.block_size, 15 * layer.block_size):
+      x = self.random_sequence(
+          batch_size,
+          time,
+          *channel_shape,
+          random_mask=only_advance_position_for_valid_timesteps,
+      )
+      self.verify_contract(layer, x, training=False)
+
+  def test_only_advance_position_for_valid_timesteps(self):
+    config = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=1.0e5,
+        only_advance_position_for_valid_timesteps=True,
+        name='rope',
+    )
+    layer = self.make_layer(config)
+
+    x = self.sl.Sequence(
+        self.xp.array(np.random.normal(size=(3, 3, 6)).astype(np.float32)),
+        self.xp.array(
+            [[False, True, True], [True, False, True], [True, True, False]]
+        ),
+    ).mask_invalid()
+
+    layer = self.init_layer(layer, x)
+    y = layer.layer(x, training=False)
+
+    # Verify the layer ignores invalid timesteps by showing the output is equal
+    # to processing a sequence without the invalid timesteps.
+    self.assertSequencesClose(
+        y[0:1, 1:],
+        layer.layer(x[0:1, 1:], training=False),
+    )
+    self.assertSequencesClose(
+        self.sl.Sequence.concatenate_sequences([y[1:2, :1], y[1:2, 2:]]),
+        layer.layer(
+            self.sl.Sequence.concatenate_sequences([x[1:2, :1], x[1:2, 2:]]),
+            training=False,
+        ),
+    )
+    self.assertSequencesClose(
+        y[2:3, :-1],
+        layer.layer(x[2:3, :-1], training=False),
+    )
+
+  def test_external_positions(self):
+    config = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=1.0e4,
+        only_advance_position_for_valid_timesteps=False,
+        positions_name='positions',
+        name='rope',
+    )
+    layer = self.make_layer(config)
+
+    x = self.random_sequence(1, 5, 8, random_lengths=False)
+    x = self.sl.Sequence.concatenate_sequences([x, x])
+
+    # Ensure position indices list is constructed via xp wrapper
+    positions_arr = self.xp.array(np.arange(10)[np.newaxis] % 5)
+    constants = {'positions': self.sl.Sequence.from_values(positions_arr)}
+    layer = self.init_layer(layer, x, constants=constants)
+    self.assertEqual(layer.block_size, 1)
+    self.assertEqual(layer.output_ratio, 1)
+    self.assertEqual(layer.name, 'rope')
+    self.assertEqual(layer.get_output_shape(x.channel_shape), x.channel_shape)
+
+    y = self.verify_contract(
+        layer,
+        x,
+        constants=constants,
+        training=False,
+        stream_constants=True,
+        pad_constants=True,
+    )
+    # Since the positions repeat, the first half should equal the second half.
+    self.assertSequencesClose(y[:, :5], y[:, 5:])
+
+  def test_error_only_advance_position_for_valid_timesteps_and_external_positions(
+      self,
+  ):
+    config = self.sl.ApplyRotaryPositionalEncoding.Config(
+        max_wavelength=1.0e4,
+        positions_name='positions',
+        only_advance_position_for_valid_timesteps=True,
+        name='rope',
+    )
+    layer = self.make_layer(config)
+    x = self.random_sequence(1, 5, 8, random_lengths=False)
+    x = self.sl.Sequence.concatenate_sequences([x, x])
+    positions_arr = self.xp.array(np.arange(10)[np.newaxis] % 5)
+    constants = {'positions': self.sl.Sequence.from_values(positions_arr)}
+    with self.assertRaises(ValueError):
+      self.init_layer(layer, x, constants=constants)
diff --git a/sequence_layers/specs/simple.py b/sequence_layers/specs/simple.py
new file mode 100644
index 0000000..c05fa8a
--- /dev/null
+++ b/sequence_layers/specs/simple.py
@@ -0,0 +1,635 @@
+"""Specifications for simple layers.
+
+See the corresponding _behaviors module for behaviors.
+"""
+
+# pylint: disable=abstract-method
+
+import abc
+import dataclasses
+from typing import Any, Callable, Protocol, runtime_checkable, Sequence, TypeVar
+
+from sequence_layers.specs import types as types_spec
+
+# isort: off
+from sequence_layers.specs.types import (  # pylint: disable=unused-import
+    HashableArray,
+)
+# isort: on
+
+# ---------------------------------------------------------------------------
+# Activation Functions (StatelessPointwiseFunctor)
+# ---------------------------------------------------------------------------
+
+
+class Identity[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Identity layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Identity layer."""
+
+
+class Relu[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Relu layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Relu layer."""
+
+
+class Gelu[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Gelu layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Gelu layer."""
+
+
+class Abs[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Abs layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Abs layer."""
+
+
+class Exp[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Exp layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Exp layer."""
+
+
+class Log[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Log layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Log layer."""
+
+
+class Swish[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Swish layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Swish layer."""
+
+
+class Tanh[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Tanh layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Tanh layer."""
+
+
+class Sigmoid[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Sigmoid layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Sigmoid layer."""
+
+
+class LeakyRelu[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for LeakyRelu layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for LeakyRelu layer."""
+
+
+class Elu[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Elu layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Elu layer."""
+
+
+class Softmax[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Softmax layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Softmax layer."""
+
+
+class Softplus[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Softplus layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Softplus layer."""
+
+
+# ---------------------------------------------------------------------------
+# Simple Math and Pointwise (StatelessPointwise)
+# ---------------------------------------------------------------------------
+
+
+class Cast[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Cast layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Cast layer."""
+
+
+class Scale[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Scale layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Scale layer."""
+
+
+class Add[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Add layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Add layer."""
+
+
+class MaskInvalid[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for MaskInvalid layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for MaskInvalid layer."""
+
+
+# ---------------------------------------------------------------------------
+# Gating (Stateless)
+# ---------------------------------------------------------------------------
+
+
+T = TypeVar('T')
+
+
+class GatedUnit[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for GatedUnit layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config[T](types_spec.SequenceLayerConfig):
+    """Configuration for GatedUnit layer."""
+
+    feature_activation: Callable[[T], T] | None
+    gate_activation: Callable[[T], T] | None
+
+
+class GatedLinearUnit[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](GatedUnit[SequenceT, ShapeDTypeT], metaclass=abc.ABCMeta):
+  """Specification for GatedLinearUnit layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config[T](GatedUnit.Config[T]):
+    """Configuration for GatedLinearUnit layer."""
+
+
+class GatedTanhUnit[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](GatedUnit[SequenceT, ShapeDTypeT], metaclass=abc.ABCMeta):
+  """Specification for GatedTanhUnit layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config[T](GatedUnit.Config[T]):
+    """Configuration for GatedTanhUnit layer."""
+
+
+# ---------------------------------------------------------------------------
+# Shape Operations (Stateless)
+# ---------------------------------------------------------------------------
+
+
+class Flatten[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Flatten layer."""
+
+
+class Reshape[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Reshape layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Reshape layer."""
+
+    output_shape: Sequence[int]
+
+
+class ExpandDims[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for ExpandDims layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for ExpandDims layer."""
+
+    axis: int | Sequence[int]
+
+
+class Squeeze[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Squeeze layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Squeeze layer."""
+
+    axis: int | Sequence[int] | None
+
+
+class Transpose[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Transpose layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Transpose layer."""
+
+    axes: Sequence[int] | None
+
+
+# ---------------------------------------------------------------------------
+# Other Simple Layers
+# ---------------------------------------------------------------------------
+
+
+class OneHot[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for OneHot layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for OneHot layer."""
+
+    depth: int
+
+
+class Embedding[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Embedding layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Embedding layer."""
+
+    dimension: int
+    num_embeddings: int
+
+
+class Dropout[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Dropout layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Dropout layer."""
+
+    rate: float
+
+
+class Downsample1D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Downsample1D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Downsample1D layer."""
+
+    rate: int
+
+
+class Upsample1D[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Upsample1D layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Upsample1D layer."""
+
+    rate: int
+
+
+class CheckpointName[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for CheckpointName layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for CheckpointName layer."""
+
+    checkpoint_name: str
+
+
+class Lambda[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Lambda layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Lambda layer."""
+
+    fn: Callable[..., Any]
+
+
+class Logging[
+    SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
+](
+    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
+    metaclass=abc.ABCMeta,
+):
+  """Specification for Logging layer."""
+
+  @dataclasses.dataclass(frozen=True)
+  class Config(types_spec.SequenceLayerConfig):
+    """Configuration for Logging layer."""
+
+    prefix: str
+
+
+# pylint: disable=invalid-name
+# pylint: disable=missing-function-docstring
+@runtime_checkable
+class ModuleSpec(Protocol):
+  """Protocol for simple layers module."""
+
+  @property
+  def Identity(self) -> type[Identity]:
+    ...
+
+  @property
+  def Relu(self) -> type[Relu]:
+    ...
+
+  @property
+  def Gelu(self) -> type[Gelu]:
+    ...
+
+  @property
+  def Swish(self) -> type[Swish]:
+    ...
+
+  @property
+  def Tanh(self) -> type[Tanh]:
+    ...
+
+  @property
+  def Sigmoid(self) -> type[Sigmoid]:
+    ...
+
+  @property
+  def LeakyRelu(self) -> type[LeakyRelu]:
+    ...
+
+  @property
+  def Elu(self) -> type[Elu]:
+    ...
+
+  @property
+  def Softmax(self) -> type[Softmax]:
+    ...
+
+  @property
+  def Softplus(self) -> type[Softplus]:
+    ...
+
+  @property
+  def Cast(self) -> type[Cast]:
+    ...
+
+  @property
+  def Scale(self) -> type[Scale]:
+    ...
+
+  @property
+  def Add(self) -> type[Add]:
+    ...
+
+  @property
+  def MaskInvalid(self) -> type[MaskInvalid]:
+    ...
+
+  @property
+  def GatedUnit(self) -> type[GatedUnit]:
+    ...
+
+  @property
+  def GatedLinearUnit(self) -> type[GatedLinearUnit]:
+    ...
+
+  @property
+  def GatedTanhUnit(self) -> type[GatedTanhUnit]:
+    ...
+
+  @property
+  def Flatten(self) -> type[Flatten]:
+    ...
+
+  @property
+  def Reshape(self) -> type[Reshape]:
+    ...
+
+  @property
+  def ExpandDims(self) -> type[ExpandDims]:
+    ...
+
+  @property
+  def Squeeze(self) -> type[Squeeze]:
+    ...
+
+  @property
+  def Transpose(self) -> type[Transpose]:
+    ...
+
+  @property
+  def OneHot(self) -> type[OneHot]:
+    ...
+
+  @property
+  def Embedding(self) -> type[Embedding]:
+    ...
+
+  @property
+  def Dropout(self) -> type[Dropout]:
+    ...
+
+  @property
+  def Downsample1D(self) -> type[Downsample1D]:
+    ...
+
+  @property
+  def Upsample1D(self) -> type[Upsample1D]:
+    ...
+
+  @property
+  def CheckpointName(self) -> type[CheckpointName]:
+    ...
+
+  @property
+  def Lambda(self) -> type[Lambda]:
+    ...
+
+  @property
+  def Logging(self) -> type[Logging]:
+    ...
+
+
+__all__ = [
+    name
+    for name, attr in globals().items()
+    if isinstance(attr, type) and not name.startswith('_')
+]
diff --git a/sequence_layers/specs/simple_behaviors.py b/sequence_layers/specs/simple_behaviors.py
new file mode 100644
index 0000000..8755bcf
--- /dev/null
+++ b/sequence_layers/specs/simple_behaviors.py
@@ -0,0 +1,795 @@
+"""Behavior tests for simple layers.
+
+Backend-specific test files should inherit from these tests.
+"""
+
+# pylint: disable=abstract-method
+# pyrefly: disable=bad-instantiation
+
+from fractions import Fraction
+from typing import Any, override
+from unittest import mock
+
+from absl import logging
+from absl.testing import parameterized
+import numpy as np
+
+from sequence_layers.specs import simple as simple_spec
+from sequence_layers.specs import test_utils
+
+
+class ModuleSpecTest(test_utils.ModuleSpecTest):
+  """Test that a backend-specific module implements the ModuleSpec protocol."""
+
+  @override
+  def module_spec_pairs(self, backend_sl: Any) -> dict[Any, Any]:
+    return {backend_sl.simple: simple_spec.ModuleSpec}
+
+
+class IdentityTest(test_utils.SequenceLayerTest):
+  """Test behavior of Identity layer."""
+
+  def test_defaults(self):
+    # pyrefly: ignore [missing-attribute]
+    self.assertConfigDefaults(self.sl.Identity.Config, {'name': None})
+
+  @parameterized.parameters((((2, 3, 5)),), (((2, 3, 5, 9)),))
+  def test_identity(self, shape):
+    x = self.random_sequence(*shape)
+    # pyrefly: ignore [missing-attribute]
+    l = self.sl.Identity.Config(name='identity').make()
+    l = self.init_layer(l, x)
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.verify_contract(l, x, training=False)
+
+
+class PointwiseMathTest(test_utils.SequenceLayerTest):
+  """Test behavior of pointwise math layers."""
+
+  def test_defaults(self):
+    # pyrefly: ignore [missing-attribute]
+    for layer_cls in [self.sl.Abs, self.sl.Exp, self.sl.Log]:
+      with self.subTest(layer=layer_cls.__name__):
+        self.assertConfigDefaults(layer_cls.Config, {'name': None})
+
+  def make_layer(self, layer_name):
+    """Helper to create a layer by name."""
+    layer_cls = getattr(self.sl, layer_name)
+    return layer_cls.Config(name=layer_name.lower()).make()
+
+  def test_pointwise_math(self):
+    params = [
+        ('Relu', 'relu', False),
+        ('Sigmoid', 'sigmoid', False),
+        ('Tanh', 'tanh', False),
+        ('Elu', 'elu', False),
+        ('Softplus', 'softplus', False),
+        ('Swish', 'swish', False),
+        ('Gelu', 'gelu', False),
+        ('Abs', 'abs', True),
+        ('Exp', 'exp', True),
+        ('Log', 'log', True),
+        ('Softmax', 'softmax', False),
+    ]
+    for layer_name, method_name, is_xp in params:
+      with self.subTest(layer=layer_name):
+        x = self.random_sequence(2, 10, 4)
+        l = self.make_layer(layer_name)
+        l = self.init_layer(l, x)
+
+        self.assertEqual(l.block_size, 1)
+        self.assertEqual(l.output_ratio, 1)
+
+        y = self.verify_contract(l, x, training=False)
+
+        activation = getattr(
+            self.sl.backend.xp if is_xp else self.nn, method_name
+        )
+        y_expected = x.apply_values(activation).mask_invalid()
+        self.assertSequencesClose(y, y_expected, rtol=1e-5, atol=1e-5)
+
+  @parameterized.parameters(
+      ('Softmax', 'softmax', -1),
+      ('Softmax', 'softmax', -2),
+      ('Softmax', 'softmax', 2),
+      ('Softmax', 'softmax', 3),
+  )
+  def test_pointwise_math_axis(self, layer_name, method_name, axis):
+    batch_size, time, channels, channels2 = 2, 10, 4, 3
+    x = self.random_sequence(batch_size, time, channels, channels2)
+    l = getattr(self.sl, layer_name).Config(name='test', axis=axis).make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.get_output_shape_for_sequence(x), (channels, channels2))
+    self.assertEqual(l.name, 'test')
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    activation = getattr(self.nn, method_name)
+    y_expected = x.apply_values(
+        lambda v: activation(v, axis=axis)
+    ).mask_invalid()
+    self.assertSequencesClose(y, y_expected)
+
+  @parameterized.parameters(
+      ('Softmax', (2, 10, 4), -2),
+      ('Softmax', (2, 10, 4), -3),
+      ('Softmax', (2, 10, 4), 0),
+      ('Softmax', (2, 10, 4), 1),
+      ('Softmax', (2, 10), -1),
+  )
+  def test_pointwise_math_axis_invalid(self, layer_name, shape, axis):
+    x = self.random_sequence(*shape)
+    l = getattr(self.sl, layer_name).Config(name='test', axis=axis).make()
+
+    with self.assertRaises(ValueError):
+      l = self.init_layer(l, x)
+      l.layer(x, training=False)
+
+
+class Downsample1DTest(test_utils.SequenceLayerTest):
+  """Test behavior of Downsample1D layer."""
+
+  @parameterized.parameters(((2, 3, 5), 2), ((2, 3, 5, 9), 3))
+  def test_downsample1d(self, shape, rate):
+    x = self.random_sequence(*shape)
+    # pyrefly: ignore [missing-attribute]
+    l = self.sl.Downsample1D.Config(rate=rate, name='downsample_1d').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, rate)
+    self.assertEqual(l.output_ratio, Fraction(1, rate))
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), x.channel_shape)
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    np.testing.assert_array_equal(y.values, x.values[:, ::rate])
+    np.testing.assert_array_equal(y.mask, x.mask[:, ::rate])
+
+
+class Upsample1DTest(test_utils.SequenceLayerTest):
+  """Test behavior of Upsample1D layer."""
+
+  @parameterized.parameters(((2, 3, 5), 2), ((2, 3, 5, 9), 3))
+  def test_upsample1d(self, shape, rate):
+    x = self.random_sequence(*shape)
+    # pyrefly: ignore [missing-attribute]
+    l = self.sl.Upsample1D.Config(rate=rate, name='upsample_1d').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, rate)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), x.channel_shape)
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    for i in range(rate):
+      np.testing.assert_array_equal(x.values, y.values[:, i::rate])
+      np.testing.assert_array_equal(x.mask, y.mask[:, i::rate])
+
+
+class TransposeTest(test_utils.SequenceLayerTest):
+  """Test behavior of Transpose layer."""
+
+  @parameterized.parameters(
+      ((2, 3, 4, 5), (2, 3), (4, 5)),
+      ((2, 3, 4, 5, 6), (4, 2, 3), (6, 4, 5)),
+      ((2, 3), None, ()),
+  )
+  def test_transpose(self, input_shape, axes, _output_shape):
+    x = self.random_sequence(*input_shape)
+    # pyrefly: ignore [missing-attribute]
+    l = self.sl.Transpose.Config(axes=axes, name='transpose').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), _output_shape)
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    # Verify shape and values
+    if axes is not None:
+      y_expected = x.apply_values(np.transpose, (0, 1) + axes)
+    else:
+      axes_seq = (0, 1) + tuple(range(2, x.ndim))[::-1]
+      y_expected = x.apply_values(np.transpose, axes_seq)
+
+    self.assertSequencesEqual(y, y_expected)
+
+
+class DropoutTest(test_utils.SequenceLayerTest):
+  """Test behavior of Dropout layer."""
+
+  def test_defaults(self):
+    self.assertConfigDefaults(
+        # pyrefly: ignore [missing-attribute]
+        self.sl.Dropout.Config,
+        {'rate': 0.0, 'name': None},
+    )
+
+  def test_dropout_inference(self):
+    # pyrefly: ignore [missing-attribute]
+    l = self.sl.Dropout.Config(rate=0.5, name='dropout').make()
+    x = self.random_sequence(2, 3, 5)
+    l = self.init_layer(l, x)
+    y = l.layer(x, training=False)
+    # In inference, dropout should be identity
+    np.testing.assert_allclose(y.values, x.values)
+
+
+class FlattenTest(test_utils.SequenceLayerTest):
+  """Test behavior of Flatten layer."""
+
+  @parameterized.parameters(
+      (((2, 3, 5)),), (((2, 3, 5, 9)),), (((2, 3, 5, 9, 2)),)
+  )
+  def test_flatten(self, shape):
+    x = self.random_sequence(*shape)
+    l = self.sl.Flatten.Config(name='flatten').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    num_elements = np.prod(shape[2:])
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), (num_elements,))
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    # Verify shape
+    expected_shape = shape[:2] + (num_elements,)
+    self.assertEqual(y.values.shape, expected_shape)
+
+    # Verify values
+    y_expected = x.apply_values(np.reshape, shape[:2] + (num_elements,))
+    self.assertSequencesEqual(y, y_expected)
+
+
+class ReshapeTest(test_utils.SequenceLayerTest):
+  """Test behavior of Reshape layer."""
+
+  @parameterized.parameters(
+      ((2, 3, 5), (1, 5, 1)),
+      ((2, 3, 5, 9), (3, 3, 5)),
+      ((2, 3, 1), ()),
+      ((2, 3), (1,)),
+  )
+  def test_reshape(self, shape, output_shape):
+    x = self.random_sequence(*shape)
+    l = self.sl.Reshape.Config(output_shape, name='reshape').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), output_shape)
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    # Verify shape
+    expected_shape = shape[:2] + output_shape
+    self.assertEqual(y.values.shape, expected_shape)
+
+    # Verify values
+    y_expected = x.apply_values(np.reshape, shape[:2] + output_shape)
+    self.assertSequencesEqual(y, y_expected)
+
+
+class ExpandDimsTest(test_utils.SequenceLayerTest):
+  """Test behavior of ExpandDims layer."""
+
+  def test_basic(self):
+    x = self.random_sequence(2, 3, 4)
+    l = self.sl.ExpandDims.Config(axis=-1, name='expand_dims').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    y = self.verify_contract(l, x, training=False)
+    self.assertEqual(y.values.shape, (2, 3, 4, 1))
+
+  def test_output_shape(self):
+    l = self.sl.ExpandDims.Config(axis=0, name='expand_dims').make()
+    self.assertEqual(l.get_output_shape((4, 8)), (1, 4, 8))
+
+
+class SqueezeTest(test_utils.SequenceLayerTest):
+  """Test behavior of Squeeze layer."""
+
+  @parameterized.named_parameters(
+      {
+          'testcase_name': 'float_input',
+          'input_array': np.array([[[3]]], dtype=np.float32),
+          'expected_output': np.array([[3]]),
+      },
+      {
+          'testcase_name': 'int_input',
+          'input_array': np.array([[[3]]], dtype=np.int32),
+          'expected_output': np.array([[3]], dtype=np.int32),
+      },
+      {
+          'testcase_name': 'no_op_input',
+          'input_array': np.array([[3]], dtype=np.float32),
+          'expected_output': np.array([[3]]),
+      },
+      {
+          'testcase_name': 'input_with_extra_dims',
+          'input_array': np.array([[[[[3], [4]]]]], dtype=np.float32),
+          'expected_output': np.array([[[3, 4]]]),
+      },
+  )
+  def test_squeeze(self, input_array, expected_output):
+    x = self.sl.Sequence.from_values(input_array)
+    l = self.sl.Squeeze.Config(name='squeeze').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    self.assertEqual(
+        l.get_output_shape_for_sequence(x), expected_output.shape[2:]
+    )
+    test_receptive_field = np.issubdtype(input_array.dtype, np.inexact)
+    y = self.verify_contract(
+        l, x, training=False, test_receptive_field=test_receptive_field
+    )
+    self.assertEmpty(self.get_variables(l))
+
+    # Verify shape
+    self.assertEqual(y.values.shape, expected_output.shape)
+
+    # Verify values
+    np.testing.assert_allclose(y.values, expected_output)
+
+
+class ScaleTest(test_utils.SequenceLayerTest):
+  """Test behavior of Scale layer."""
+
+  @parameterized.parameters(((2, 13, 5),), ((2, 13, 5, 9),))
+  def test_basic(self, shape):
+    x = self.random_sequence(*shape)
+    l = self.sl.Scale.Config(scale=2.0, name='scale').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    # Verify values
+    y_expected = x.apply_values(lambda v: v * 2.0)
+    self.assertSequencesEqual(y, y_expected)
+
+  @parameterized.parameters(((2, 13, 5),), ((2, 13, 9, 5),))
+  def test_ndarray(self, shape):
+    x = self.random_sequence(*shape)
+    l = self.sl.Scale.Config(
+        scale=np.arange(5, dtype=np.float32), name='scale'
+    ).make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    # Verify values
+    y_expected = x.apply_values(lambda v: v * np.arange(5, dtype=np.float32))
+    self.assertSequencesEqual(y, y_expected)
+
+  def test_broadcast(self):
+    x = self.random_sequence(2, 3, 5, 1)
+    l = self.sl.Scale.Config(scale=np.ones((5, 9))).make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), (5, 9))
+    y = self.verify_contract(l, x, training=False)
+    self.assertEqual(y.values.shape, (2, 3, 5, 9))
+    self.assertEmpty(self.get_variables(l))
+
+  def test_too_many_dims(self):
+    x = self.random_sequence(2, 3, 5, 1)
+    l = self.sl.Scale.Config(scale=np.ones((5, 5, 5))).make()
+    l = self.init_layer(l, x, bind_only=True)
+    with self.assertRaises(ValueError):
+      l.get_output_shape(x.channel_shape)
+    with self.assertRaises(ValueError):
+      l.layer(x, training=False)
+
+  def test_broadcast_failure(self):
+    x = self.random_sequence(2, 3, 5, 9)
+    l = self.sl.Scale.Config(scale=np.ones((5,))).make()
+    l = self.init_layer(l, x, bind_only=True)
+    with self.assertRaises(ValueError):
+      l.get_output_shape(x.channel_shape)
+    with self.assertRaises(ValueError):
+      l.layer(x, training=False)
+
+
+class AddTest(test_utils.SequenceLayerTest):
+  """Test behavior of Add layer."""
+
+  @parameterized.parameters((((2, 13, 5)),), (((2, 13, 5, 9)),))
+  def test_add(self, shape):
+    x = self.random_sequence(*shape)
+    l = self.sl.Add.Config(-2.0, name='add').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    # Verify values
+    y_expected = x.apply_values(lambda v: v - 2.0).mask_invalid()
+    self.assertSequencesEqual(y, y_expected)
+
+  @parameterized.parameters(((2, 13, 5),), ((2, 13, 9, 5),))
+  def test_ndarray(self, shape):
+    x = self.random_sequence(*shape)
+    l = self.sl.Add.Config(
+        shift=np.arange(5, dtype=np.float32), name='add'
+    ).make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
+    y = self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    # Verify values
+    y_expected = x.apply_values(
+        lambda v: v + np.arange(5, dtype=np.float32)
+    ).mask_invalid()
+    self.assertSequencesEqual(y, y_expected)
+
+  def test_broadcast(self):
+    x = self.random_sequence(2, 3, 5, 1)
+    l = self.sl.Add.Config(shift=np.ones((5, 9))).make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), (5, 9))
+    y = self.verify_contract(l, x, training=False)
+    self.assertEqual(y.values.shape, (2, 3, 5, 9))
+    self.assertEmpty(self.get_variables(l))
+
+  def test_too_many_dims(self):
+    x = self.random_sequence(2, 3, 5, 1)
+    l = self.sl.Add.Config(shift=np.ones((5, 5, 5))).make()
+    l = self.init_layer(l, x, bind_only=True)
+    with self.assertRaises(ValueError):
+      l.get_output_shape(x.channel_shape)
+    with self.assertRaises(ValueError):
+      l.layer(x, training=False)
+
+  def test_broadcast_failure(self):
+    x = self.random_sequence(2, 3, 5, 9)
+    l = self.sl.Add.Config(shift=np.ones((5,))).make()
+    l = self.init_layer(l, x, bind_only=True)
+    with self.assertRaises(ValueError):
+      l.get_output_shape(x.channel_shape)
+    with self.assertRaises(ValueError):
+      l.layer(x, training=False)
+
+
+class CastTest(test_utils.SequenceLayerTest):
+  """Test behavior of Cast layer."""
+
+  @parameterized.parameters(
+      (((2, 3, 5)), np.float16),
+      (((2, 3, 5, 9)), np.int32),
+  )
+  def test_cast(self, shape, target_dtype):
+    x = self.random_sequence(*shape, dtype=np.float32)
+    l = self.sl.Cast.Config(target_dtype, name='cast').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:])
+    test_receptive_field = np.issubdtype(target_dtype, np.inexact)
+
+    pad_value = np.nan if target_dtype == np.float16 else 32768
+
+    y = self.verify_contract(
+        l,
+        x,
+        training=False,
+        padding_invariance_pad_value=pad_value,
+        test_receptive_field=test_receptive_field,
+    )
+    self.assertEmpty(self.get_variables(l))
+
+    self.assertEqual(y.values.dtype, target_dtype)
+
+
+class MaskInvalidTest(test_utils.SequenceLayerTest):
+  """Test behavior of MaskInvalid layer."""
+
+  def test_basic(self):
+    x = self.random_sequence(2, 15, 5)
+    l = self.sl.MaskInvalid.Config(name='mask_invalid').make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    self.assertEqual(l.get_output_shape_for_sequence(x), (5,))
+    self.verify_contract(l, x, training=False)
+    self.assertEmpty(self.get_variables(l))
+
+    # Now test specific behavior
+    # Fill invalid values with NaN
+    x_nan = x.mask_invalid(np.nan)
+
+    # Apply layer
+    y = l.layer(x_nan, training=False)
+
+    # Verify that invalid values are masked (zeroed)
+    self.assertSequencesEqual(x.mask_invalid(), y)
+
+
+class GatedUnitTest(test_utils.SequenceLayerTest):
+  """Test behavior of GatedUnit layers."""
+
+  def test_gated_activation(self):
+    shapes = ((2, 13, 6), (2, 13, 5, 10))
+
+    configs = [
+        self.sl.GatedUnit.Config(None, None),  # Bilinear
+        self.sl.GatedUnit.Config(None, self.nn.swish),  # SwiGLU
+        self.sl.GatedUnit.Config(None, self.nn.gelu),  # GeGLU
+        self.sl.GatedUnit.Config(lambda x: x, None),  # Bilinear
+        self.sl.GatedUnit.Config(self.nn.swish, self.nn.tanh),
+        self.sl.GatedTanhUnit.Config(),
+        self.sl.GatedLinearUnit.Config(),
+    ]
+
+    for shape in shapes:
+      for l_config in configs:
+        with self.subTest(shape=shape, config=str(l_config)):
+          x = self.random_sequence(*shape)
+          l = l_config.make()
+          l = self.init_layer(l, x)
+
+          self.assertEqual(l.block_size, 1)
+          self.assertEqual(l.output_ratio, 1)
+          self.assertEqual(
+              l.get_output_shape_for_sequence(x),
+              shape[2:-1] + (shape[-1] // 2,),
+          )
+          self.verify_contract(l, x, training=True)
+
+
+class OneHotTest(test_utils.SequenceLayerTest):
+  """Test behavior of OneHot layer."""
+
+  @parameterized.parameters(((1, 2, 3),), ((2, 3, 5, 9),), ((2, 3, 5, 9, 2),))
+  def test_one_hot(self, shape):
+    depth = 4
+    l = self.sl.OneHot.Config(depth, name='one_hot').make()
+    x = self.random_sequence(*shape, dtype=self.xp.int32, low=0, high=depth - 1)
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.get_output_shape_for_sequence(x), shape[2:] + (depth,))
+    self.assertEqual(l.name, 'one_hot')
+
+    l = self.init_layer(l, x)
+
+    y = self.verify_contract(
+        l,
+        x,
+        training=False,
+        padding_invariance_pad_value=0,
+        test_gradients=False,
+        test_receptive_field=False,
+    )
+    self.assertAllEqual(
+        y.values,
+        (
+            np.eye(depth)[np.array(x.values)].T
+            * np.array(x.mask).astype(np.float32).T
+        ).T,
+    )
+
+
+class EmbeddingTest(test_utils.SequenceLayerTest):
+  """Test behavior of Embedding layer."""
+
+  def test_defaults(self):
+    self.assertConfigDefaults(
+        self.sl.Embedding.Config,
+        {'dimension': 10, 'num_embeddings': 100, 'name': None},
+        dimension=10,
+        num_embeddings=100,
+    )
+
+  def test_embedding(self):
+    shapes = [(1, 2, 3), (2, 3, 5, 9)]
+    dimension, num_embeddings = 8, 5
+
+    for shape in shapes:
+      with self.subTest(shape=shape):
+        l = self.sl.Embedding.Config(
+            dimension=dimension, num_embeddings=num_embeddings, name='embedding'
+        ).make()
+        x = self.random_sequence(
+            *shape, dtype=self.xp.int32, low=0, high=num_embeddings - 1
+        )
+
+        self.assertEqual(l.block_size, 1)
+        self.assertEqual(l.output_ratio, 1)
+        self.assertEqual(
+            l.get_output_shape(x.channel_shape), shape[2:] + (dimension,)
+        )
+
+        l = self.init_layer(l, x)
+
+        self.verify_contract(
+            l,
+            x,
+            training=False,
+            test_gradients=False,
+            test_receptive_field=False,
+        )
+
+
+class LambdaTest(test_utils.SequenceLayerTest):
+  """Test behavior of Lambda layer."""
+
+  @parameterized.parameters(True, False)
+  def test_array_fn(self, mask_required: bool):
+    def fn(v):
+      if mask_required:
+        # Change the masked status by adding 1.
+        v = v + 1.0
+      return v.reshape(v.shape + (1,)) > 0.5
+
+    l = self.sl.simple.Lambda.Config(
+        fn,
+        mask_required=mask_required,
+        expected_input_spec=self.sl.types.ChannelSpec((5,), self.xp.float32),
+        name='lambda',
+    ).make()
+
+    x = self.random_sequence(2, 3, 5)
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    # Output spec reflects the changed shape and dtype.
+    self.assertEqual(l.get_output_shape(x.channel_shape), (5, 1))
+    self.assertEqual(l.get_output_dtype(x.dtype), self.xp.bool_)
+
+    y = self.verify_contract(
+        l,
+        x,
+        training=False,
+        # Receptive field test is not supported for bools.
+        test_receptive_field=False,
+    )
+
+    self.assertSequencesClose(y, x.apply_values(fn).mask_invalid())
+
+  @parameterized.parameters(True, False)
+  def test_sequence_fn(self, mask_required: bool):
+    def fn(x):
+      if mask_required:
+        # Change the masked status by adding 1.
+        x = x.apply_values(lambda v: v + 1.0)
+      return x.apply_values_masked(lambda v: v.reshape(v.shape + (1,)) > 0.5)
+
+    l = self.sl.simple.Lambda.Config(
+        fn,
+        sequence_input=True,
+        expected_input_spec=self.sl.types.ChannelSpec((5,), self.xp.float32),
+        name='lambda',
+    ).make()
+
+    x = self.random_sequence(2, 3, 5)
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+
+    # Output spec reflects the changed shape and dtype.
+    self.assertEqual(l.get_output_shape(x.channel_shape), (5, 1))
+    self.assertEqual(l.get_output_dtype(x.dtype), self.xp.bool_)
+
+    y = self.verify_contract(
+        l,
+        x,
+        training=False,
+        # Receptive field test is not supported for bools.
+        test_receptive_field=False,
+    )
+
+    self.assertSequencesClose(y, fn(x).mask_invalid())
+
+
+class CheckpointNameTest(test_utils.SequenceLayerTest):
+  """Test behavior of CheckpointName layer."""
+
+  def test_basic(self):
+    x = self.random_sequence(2, 3, 5)
+    l = self.sl.simple.CheckpointName.Config(
+        checkpoint_name='test', name='checkpoint_name'
+    ).make()
+    l = self.init_layer(l, x)
+
+    self.assertEqual(l.block_size, 1)
+    self.assertEqual(l.output_ratio, 1)
+    self.assertEqual(l.get_output_shape(x.channel_shape), (5,))
+    self.verify_contract(l, x, training=False)
+
+
+# pylint: disable=missing-function-docstring
+class Has:
+  """A simple `HAS(v)` matcher that tests whether something has `v` in it."""
+
+  def __init__(self, value):
+    self._v = value
+
+  @override
+  def __eq__(self, o):
+    return self._v in o
+
+  @override
+  def __ne__(self, o):
+    return not self == o
+
+  @override
+  def __repr__(self):
+    return f'<HAS({self._v!r})>'
+
+
+class Not:
+  """Negates a matcher."""
+
+  def __init__(self, matcher):
+    self._matcher = matcher
+
+  @override
+  def __eq__(self, o):
+    return self._matcher != o
+
+  @override
+  def __ne__(self, o):
+    return not self == o
+
+  @override
+  def __repr__(self):
+    return f'<NOT({self._matcher!r})>'
+
+
+class LoggingTest(test_utils.SequenceLayerTest):
+  """Test behavior of Logging layer."""
+
+  @mock.patch.object(logging, 'info', wraps=logging.info)
+  def test_logs_tensors(self, mock_logger):
+    x = self.sl.types.Sequence.from_values(self.xp.array([[1.414, 2, 3, 4]]))
+    training = False
+
+    with self.subTest('prefix'):
+      l = self.sl.simple.Logging.Config(prefix='test string').make()
+      l = self.init_layer(l, x, bind_only=True)
+      l.layer(x, training=training)
+      mock_logger.assert_called_with(Has('test string'))
diff --git a/sequence_layers/specs/types.py b/sequence_layers/specs/types.py
index d2b216f..395815c 100644
--- a/sequence_layers/specs/types.py
+++ b/sequence_layers/specs/types.py
@@ -956,6 +956,7 @@ def layer_with_emits(
 
 
 _ChannelSpecType = ChannelSpec
+_HashableArrayType = HashableArray
 _SequenceType = Sequence
 _MaskedSequenceType = MaskedSequence
 _SequenceLayerType = SequenceLayer
@@ -977,6 +978,10 @@ def ChannelSpec(self) -> type[_ChannelSpecType]:
   def ShapeDType(self) -> type[_ChannelSpecType]:
     ...
 
+  @property
+  def HashableArray(self) -> type[_HashableArrayType]:
+    ...
+
   @property
   def Sequence(self) -> type[_SequenceType[Any, Any]]:
     ...
diff --git a/sequence_layers/specs/types_behaviors.py b/sequence_layers/specs/types_behaviors.py
index 8bdc6ae..3f93c87 100644
--- a/sequence_layers/specs/types_behaviors.py
+++ b/sequence_layers/specs/types_behaviors.py
@@ -844,3 +844,59 @@ def test_layer_applies_fn_based_on_mask_required(self) -> None:
             else:
               mock_apply_masked.assert_called_once()
               mock_apply.assert_not_called()
+
+  def test_mask_required_default(self) -> None:
+    """Tests that mask_required defaults to True."""
+    backend_sl = self.sl
+
+    class DefaultLayer(
+        DefaultTestLayer, backend_sl.types.StatelessPointwiseFunctor
+    ):
+      """Mock layer for testing defaults."""
+
+      def fn(self, values: Any, mask: Any) -> tuple[Any, Any]:
+        """Pointwise function."""
+        return values, mask
+
+      @override
+      def layer(self, *args, **kwargs):
+        """Calls base layer."""
+        return backend_sl.types.StatelessPointwiseFunctor.layer(
+            self, *args, **kwargs
+        )
+
+      @override
+      def get_output_shape(self, *args, **kwargs):
+        """Calls base get_output_shape."""
+        return backend_sl.types.StatelessPointwiseFunctor.get_output_shape(
+            self, *args, **kwargs
+        )
+
+    layer = DefaultLayer()
+    self.assertTrue(layer.mask_required)
+
+
+class HashableArrayTest(SequenceLayerTest):
+  """Tests for HashableArray."""
+
+  def test_hashable_array(self) -> None:
+    # We need to get HashableArray from the backend types!
+    HashableArray = self.sl.types.HashableArray
+
+    # Create a numpy array
+    x = np.array([[1.0, 2.0], [3.0, 4.0]])
+
+    # Create HashableArray
+    ha = HashableArray.from_array(x)
+
+    # Check properties
+    self.assertEqual(ha.dtype, x.dtype)
+
+    # Check to_array
+    x_back = ha.to_array()
+    np.testing.assert_array_equal(x, x_back)
+
+    # Check hashability
+    h = hash(ha)
+    self.assertIsInstance(h, int)
+

From 60b7f642d9bd6c6c2880f8a2dcf3f9bb4a935044 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 19:48:25 +0000
Subject: [PATCH 11/29] fix: resolve specs mixin generic MRO issues and coerce
 Sequence inputs to mx.array

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/mlx/types.py    |  4 +--
 sequence_layers/specs/dsp.py    | 22 +++++++-------
 sequence_layers/specs/simple.py | 54 ++++++++++++++++-----------------
 3 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/sequence_layers/mlx/types.py b/sequence_layers/mlx/types.py
index 33eedaa..327b33c 100644
--- a/sequence_layers/mlx/types.py
+++ b/sequence_layers/mlx/types.py
@@ -168,8 +168,8 @@ class Sequence[ValuesT: mx.array, MaskT: mx.array](
   mask: MaskT
 
   def __init__(self, values: ValuesT, mask: MaskT):
-    self.values = values
-    self.mask = mask
+    self.values = mx.array(values) if not isinstance(values, mx.array) else values
+    self.mask = mx.array(mask) if not isinstance(mask, mx.array) else mask
 
   @property
   @override
diff --git a/sequence_layers/specs/dsp.py b/sequence_layers/specs/dsp.py
index d312fd2..270956f 100644
--- a/sequence_layers/specs/dsp.py
+++ b/sequence_layers/specs/dsp.py
@@ -26,8 +26,8 @@
 class Delay[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesShape[SequenceT, SequenceT, ShapeDTypeT],
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesShape,
+    types_spec.PreservesType,
     types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -49,8 +49,8 @@ def make(self) -> Any:
 class Lookahead[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesShape[SequenceT, SequenceT, ShapeDTypeT],
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesShape,
+    types_spec.PreservesType,
     types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -72,8 +72,8 @@ def make(self) -> Any:
 class Window[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesShape[SequenceT, SequenceT, ShapeDTypeT],
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesShape,
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -95,7 +95,7 @@ def make(self) -> Any:
 class Frame[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -121,7 +121,7 @@ def make(self) -> Any:
 class OverlapAdd[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.SequenceLayer[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -142,7 +142,7 @@ def make(self) -> Any:
 
 
 class FFT[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -163,7 +163,7 @@ def make(self) -> Any:
 
 
 class IFFT[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -279,7 +279,7 @@ def make(self) -> Any:
 class LinearToMelSpectrogram[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
diff --git a/sequence_layers/specs/simple.py b/sequence_layers/specs/simple.py
index c05fa8a..760afe3 100644
--- a/sequence_layers/specs/simple.py
+++ b/sequence_layers/specs/simple.py
@@ -25,7 +25,7 @@
 class Identity[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -37,7 +37,7 @@ class Config(types_spec.SequenceLayerConfig):
 
 
 class Relu[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -49,7 +49,7 @@ class Config(types_spec.SequenceLayerConfig):
 
 
 class Gelu[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -61,7 +61,7 @@ class Config(types_spec.SequenceLayerConfig):
 
 
 class Abs[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -73,7 +73,7 @@ class Config(types_spec.SequenceLayerConfig):
 
 
 class Exp[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -85,7 +85,7 @@ class Config(types_spec.SequenceLayerConfig):
 
 
 class Log[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -99,7 +99,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Swish[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -111,7 +111,7 @@ class Config(types_spec.SequenceLayerConfig):
 
 
 class Tanh[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -125,7 +125,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Sigmoid[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -139,7 +139,7 @@ class Config(types_spec.SequenceLayerConfig):
 class LeakyRelu[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -151,7 +151,7 @@ class Config(types_spec.SequenceLayerConfig):
 
 
 class Elu[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -165,7 +165,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Softmax[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -179,7 +179,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Softplus[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -209,7 +209,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Scale[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -221,7 +221,7 @@ class Config(types_spec.SequenceLayerConfig):
 
 
 class Add[SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -235,7 +235,7 @@ class Config(types_spec.SequenceLayerConfig):
 class MaskInvalid[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -257,7 +257,7 @@ class Config(types_spec.SequenceLayerConfig):
 class GatedUnit[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -299,7 +299,7 @@ class Config[T](GatedUnit.Config[T]):
 class Flatten[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -309,7 +309,7 @@ class Flatten[
 class Reshape[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -325,7 +325,7 @@ class Config(types_spec.SequenceLayerConfig):
 class ExpandDims[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -341,7 +341,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Squeeze[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -357,7 +357,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Transpose[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -409,7 +409,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Dropout[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -425,7 +425,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Downsample1D[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -441,7 +441,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Upsample1D[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -457,7 +457,7 @@ class Config(types_spec.SequenceLayerConfig):
 class CheckpointName[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwiseFunctor[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):
@@ -488,7 +488,7 @@ class Config(types_spec.SequenceLayerConfig):
 class Logging[
     SequenceT: types_spec.Sequence, ShapeDTypeT: types_spec.ChannelSpec
 ](
-    types_spec.PreservesType[SequenceT, SequenceT, ShapeDTypeT],
+    types_spec.PreservesType,
     types_spec.StatelessPointwise[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
 ):

From 1d14507d96c392dd2cc9ebdda9cceeab90f38c2b Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 20:36:13 +0000
Subject: [PATCH 12/29] refactor(mlx): Port signal/utils unit tests and fix
 latency validation

- Ported JAX signal utility tests (hann, hamming, inv_stft) to MLX with SciPy parity.
- Created MLX utils unit tests covering make_layer, latency, and delay.
- Fixed make_layer to respect custom config.make() implementations (e.g. for gated units).
- Enforced strict latency validation in MLX utils.get_output_latency to match JAX.
- Included legacy JAX conditioning base class fix from previous rebase step.

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/jax/conditioning.py |   1 +
 sequence_layers/mlx/signal_test.py  |  70 +++++++++++++++++++
 sequence_layers/mlx/utils.py        |   8 ++-
 sequence_layers/mlx/utils_test.py   | 102 ++++++++++++++++++++++++++++
 4 files changed, 179 insertions(+), 2 deletions(-)
 create mode 100644 sequence_layers/mlx/signal_test.py
 create mode 100644 sequence_layers/mlx/utils_test.py

diff --git a/sequence_layers/jax/conditioning.py b/sequence_layers/jax/conditioning.py
index 31caead..1490a8d 100644
--- a/sequence_layers/jax/conditioning.py
+++ b/sequence_layers/jax/conditioning.py
@@ -64,6 +64,7 @@ def _get_conditioning(
 
 class BaseConditioning(
     types.PreservesType,
+    types.SequenceLayer,
     conditioning_spec.BaseConditioning[types.Sequence, types.ChannelSpec],
     metaclass=abc.ABCMeta,
 ):
diff --git a/sequence_layers/mlx/signal_test.py b/sequence_layers/mlx/signal_test.py
new file mode 100644
index 0000000..37f6a1b
--- /dev/null
+++ b/sequence_layers/mlx/signal_test.py
@@ -0,0 +1,70 @@
+import unittest
+from absl.testing import absltest
+from absl.testing import parameterized
+import numpy as np
+from scipy import signal as sp_signal
+
+from sequence_layers.mlx import signal as mlx_signal
+from sequence_layers.jax import signal as jax_signal
+
+
+class WindowTest(parameterized.TestCase):
+
+  @parameterized.product(
+      length=(16, 32, 64, 65),
+      periodic=(True, False),
+      dtype=(np.float32, np.float64),
+  )
+  def test_hann_window(self, length, periodic, dtype):
+    mlx_win = mlx_signal.hann_window(length, periodic=periodic, dtype=dtype)
+    jax_win = jax_signal.hann_window(length, periodic=periodic, dtype=dtype)
+
+    self.assertEqual(mlx_win.dtype, dtype)
+    np.testing.assert_allclose(mlx_win, np.array(jax_win), atol=1e-6)
+
+    # Also compare with scipy
+    sym = not (periodic and (length % 2 == 0))
+    scipy_win = sp_signal.windows.hann(length, sym=sym).astype(dtype)
+    np.testing.assert_allclose(mlx_win, scipy_win, atol=1e-6)
+
+  @parameterized.product(
+      length=(16, 32, 64, 65),
+      periodic=(True, False),
+      dtype=(np.float32, np.float64),
+  )
+  def test_hamming_window(self, length, periodic, dtype):
+    mlx_win = mlx_signal.hamming_window(length, periodic=periodic, dtype=dtype)
+    jax_win = jax_signal.hamming_window(length, periodic=periodic, dtype=dtype)
+
+    self.assertEqual(mlx_win.dtype, dtype)
+    np.testing.assert_allclose(mlx_win, np.array(jax_win), atol=1e-6)
+
+    # Also compare with scipy
+    sym = not (periodic and (length % 2 == 0))
+    scipy_win = sp_signal.windows.hamming(length, sym=sym).astype(dtype)
+    np.testing.assert_allclose(mlx_win, scipy_win, atol=1e-6)
+
+
+class InverseStftWindowFnTest(parameterized.TestCase):
+
+  @parameterized.product(
+      frame_length=(64, 128, 256),
+      frame_step=(16, 32, 64),
+      dtype=(np.float32, np.float64),
+  )
+  def test_inverse_stft_window_fn(self, frame_length, frame_step, dtype):
+    if frame_step > frame_length:
+      self.skipTest("frame_step must be <= frame_length")
+
+    mlx_inv_fn = mlx_signal.inverse_stft_window_fn(frame_step)
+    mlx_inv_win = mlx_inv_fn(frame_length, dtype=dtype)
+
+    jax_inv_fn = jax_signal.inverse_stft_window_fn(frame_step)
+    jax_inv_win = jax_inv_fn(frame_length, dtype=dtype)
+
+    self.assertEqual(mlx_inv_win.dtype, dtype)
+    np.testing.assert_allclose(mlx_inv_win, np.array(jax_inv_win), atol=1e-6)
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/sequence_layers/mlx/utils.py b/sequence_layers/mlx/utils.py
index 812b0ae..a2739c2 100644
--- a/sequence_layers/mlx/utils.py
+++ b/sequence_layers/mlx/utils.py
@@ -54,8 +54,7 @@ def _get_accumulated_output_latency(layer, output_latency):
     return _get_accumulated_output_latency(layer.child, output_latency)
 
   # Single layer: compute latency.
-  output_ratio = layer.output_ratio
-  return int(output_latency * output_ratio) + layer.output_latency
+  return layer.get_accumulated_output_latency(output_latency)
 
 
 def get_required_stepwise_delay(output_ratio, input_latency):
@@ -222,6 +221,11 @@ def make_layer(config, backend='mlx') -> Any:
 
     try:
       mlx_config = mlx_config_class(**kwargs)
+      if (
+          hasattr(mlx_config, 'make')
+          and type(mlx_config).make != specs_types.SequenceLayerConfig.make
+      ):
+        return mlx_config.make()
       return mlx_class(mlx_config)
     except Exception as e:  # pylint: disable=broad-exception-caught
       raise AttributeError(
diff --git a/sequence_layers/mlx/utils_test.py b/sequence_layers/mlx/utils_test.py
new file mode 100644
index 0000000..d16993a
--- /dev/null
+++ b/sequence_layers/mlx/utils_test.py
@@ -0,0 +1,102 @@
+import unittest
+from absl.testing import absltest
+from absl.testing import parameterized
+import numpy as np
+
+import sequence_layers.jax as jax_sl
+import sequence_layers.mlx as mlx_sl
+from sequence_layers.mlx import utils as mlx_utils
+from sequence_layers.jax import utils as jax_utils
+
+
+class UtilsTest(parameterized.TestCase):
+
+  def test_make_layer_simple(self):
+    jax_config = jax_sl.Scale.Config(scale=0.5)
+    mlx_layer = mlx_utils.make_layer(jax_config)
+    self.assertIsInstance(mlx_layer, mlx_sl.Scale)
+    np.testing.assert_allclose(mlx_layer.config.scale.to_array(), 0.5)
+
+  def test_make_layer_gated_unit(self):
+    # Verifies our fix for gated units
+    jax_config = jax_sl.GatedLinearUnit.Config()
+    mlx_layer = mlx_utils.make_layer(jax_config)
+    self.assertIsInstance(mlx_layer, mlx_sl.GatedLinearUnit)
+    # Activations should be populated correctly
+    self.assertIsNone(mlx_layer._feature_activation)
+    self.assertIsNotNone(mlx_layer._gate_activation) # should be mx.sigmoid
+
+  def test_get_required_stepwise_delay(self):
+    from fractions import Fraction
+    ratios = [Fraction(1, 2), Fraction(1, 4), Fraction(2, 1), Fraction(4, 1)]
+    latencies = [0, 1, 2, 3, 4, 5, 6, 7, 8]
+
+    for ratio in ratios:
+      for latency in latencies:
+        with self.subTest(ratio=str(ratio), latency=latency):
+          try:
+            mlx_delay = mlx_utils.get_required_stepwise_delay(ratio, latency)
+            jax_delay = jax_utils.get_required_stepwise_delay(ratio, latency)
+            self.assertEqual(mlx_delay, jax_delay)
+          except NotImplementedError:
+            with self.assertRaises(NotImplementedError):
+              jax_utils.get_required_stepwise_delay(ratio, latency)
+
+  @parameterized.product(
+      accumulated_latency=(0, 1, 2, 3, 4),
+  )
+  def test_get_output_latency_simple(self, accumulated_latency):
+    jax_config = jax_sl.Scale.Config(scale=0.5)
+    mlx_config = mlx_sl.Scale.Config(scale=0.5)
+    
+    mlx_lat = mlx_utils.get_output_latency(mlx_config, accumulated_latency)
+    jax_lat = jax_utils.get_output_latency(jax_config, accumulated_latency)
+    
+    self.assertEqual(mlx_lat, jax_lat)
+
+  @parameterized.product(
+      accumulated_latency=(0, 1, 2, 3, 4),
+  )
+  def test_get_output_latency_serial(self, accumulated_latency):
+    jax_config = jax_sl.Serial.Config(
+        layers=[
+            jax_sl.Scale.Config(scale=0.5),
+            jax_sl.Add.Config(shift=1.0),
+        ]
+    )
+    mlx_config = mlx_sl.Serial.Config(
+        layers=[
+            mlx_sl.Scale.Config(scale=0.5),
+            mlx_sl.Add.Config(shift=1.0),
+        ]
+    )
+    
+    mlx_lat = mlx_utils.get_output_latency(mlx_config, accumulated_latency)
+    jax_lat = jax_utils.get_output_latency(jax_config, accumulated_latency)
+    
+    self.assertEqual(mlx_lat, jax_lat)
+
+  def test_get_output_latency_validation(self):
+    # Pooling with stride=2 has output_ratio = 1/2.
+    # Divisor for latency is 1 / (1/2) = 2.
+    # If accumulated_latency is odd (e.g. 1), it should raise ValueError in JAX.
+    jax_config = jax_sl.MaxPooling1D.Config(pool_size=2, strides=2)
+    mlx_config = mlx_sl.MaxPooling1D.Config(pool_size=2, strides=2)
+    
+    # For even latency, both should succeed and match
+    mlx_lat_even = mlx_utils.get_output_latency(mlx_config, 2)
+    jax_lat_even = jax_utils.get_output_latency(jax_config, 2)
+    self.assertEqual(mlx_lat_even, jax_lat_even)
+    
+    # For odd latency, JAX should raise ValueError
+    with self.assertRaises(ValueError):
+      jax_utils.get_output_latency(jax_config, 1)
+      
+    # Currently, MLX might NOT raise ValueError because _get_accumulated_output_latency bypasses it.
+    # We assert it raises ValueError to enforce parity.
+    with self.assertRaises(ValueError):
+      mlx_utils.get_output_latency(mlx_config, 1)
+
+
+if __name__ == '__main__':
+  absltest.main()

From 34e7be2e196aaae5ad28ced2748c197b7f724f4f Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 20:56:22 +0000
Subject: [PATCH 13/29] chore(release): bump version to 0.3.0rc1

---
 sequence_layers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sequence_layers/__init__.py b/sequence_layers/__init__.py
index e122cc1..c9e86ac 100644
--- a/sequence_layers/__init__.py
+++ b/sequence_layers/__init__.py
@@ -13,4 +13,4 @@
 # limitations under the License.
 """Package directory file for Sequence Layers."""
 
-__version__ = '0.2'
+__version__ = '0.3.0rc1'

From 93dcd7fae5ae893861256c8bf37d65df16d41f0a Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 21:23:52 +0000
Subject: [PATCH 14/29] refactor(jax): remove dead projection helper to secure
 checkpoint backward compatibility

- Deleted `AttentionInputProjectionHelper` from JAX attention `common.py` to purge dead namespace-flattening helper code.
- Stripped `AttentionInputProjectionHelper` from the inheritance base lists of all JAX self and cross attention layers, ensuring strict backward compatibility of Flax parameter PyTree namespaces with the `main` branch.

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/jax/attention/common.py       | 291 ------------------
 .../jax/attention/dot_product_attention.py    |   1 -
 .../attention/dot_product_self_attention.py   |   1 -
 .../local_dot_product_self_attention.py       |   1 -
 .../streaming_dot_product_attention.py        |   1 -
 .../streaming_local_dot_product_attention.py  |   1 -
 6 files changed, 296 deletions(-)

diff --git a/sequence_layers/jax/attention/common.py b/sequence_layers/jax/attention/common.py
index 782359c..e1d7c79 100644
--- a/sequence_layers/jax/attention/common.py
+++ b/sequence_layers/jax/attention/common.py
@@ -765,297 +765,6 @@ def make(
     )
 
 
-class AttentionInputProjectionHelper:
-  """Helper class for shared attention input projection logic."""
-
-  def _setup_projection_layers(
-      self,
-      config: QueryKeyValueProjectionConfig,
-      num_query_heads: int,
-      num_kv_heads: int,
-      units_per_head: int,
-      use_bias: bool,
-      precision: jax.lax.PrecisionLike,
-      compute_dtype: types.DType,
-      param_dtype: types.DType,
-      allow_combined_qkv: bool = True,
-  ) -> None:
-    """Creates submodules, must be called from nn.Module.setup in subclasses."""
-    match config:
-      case CombinedQueryKeyValueProjection():
-        if not allow_combined_qkv:
-          raise ValueError(
-              'CombinedQueryKeyValueProjection is not supported. Use'
-              ' SeparateQueryKeyValueProjection or'
-              ' QueryAndSharedKeyValueProjection.'
-          )
-        if num_query_heads != num_kv_heads:
-          raise ValueError(
-              f'num_query_heads={num_query_heads} !='
-              f' num_kv_heads={num_kv_heads}'
-          )
-        num_stacked = 2 if config.share_kv_projection else 3
-        self._qkv = utils.FlaxEinsumDense(
-            equation='...a,abcd->...bcd',
-            output_shape=(num_stacked, num_query_heads, units_per_head),
-            bias_axes='bcd' if use_bias else None,
-            kernel_init=utils.shard_initializer(
-                config.qkv_kernel_init,
-                config.qkv_kernel_sharding,
-                projectable=True,
-                axes_types=(
-                    meta.AxisType.FANIN,
-                    meta.AxisType.STACKED,
-                    None,
-                    None,
-                ),
-            ),
-            bias_init=utils.shard_initializer(
-                config.bias_init, config.bias_sharding
-            ),
-            precision=precision,
-            compute_dtype=compute_dtype,
-            param_dtype=param_dtype,
-            einsum_factory=config.einsum_factory,
-            quantization_provider=config.quantization_provider,
-            name='query_key_value_projection',
-        )
-      case SeparateQueryKeyValueProjection():
-        self._q = utils.FlaxEinsumDense(
-            equation='...a,abc->...bc',
-            output_shape=(num_query_heads, units_per_head),
-            bias_axes='bc' if use_bias else None,
-            kernel_init=utils.shard_initializer(
-                config.q_kernel_init,
-                config.q_kernel_sharding,
-                projectable=True,
-                axes_types=(meta.AxisType.FANIN, None, None),
-            ),
-            bias_init=utils.shard_initializer(
-                config.bias_init, config.bias_sharding
-            ),
-            precision=precision,
-            compute_dtype=compute_dtype,
-            param_dtype=param_dtype,
-            einsum_factory=config.einsum_factory,
-            quantization_provider=config.quantization_provider,
-            name='query_projection',
-        )
-        self._k = utils.FlaxEinsumDense(
-            equation='...a,abc->...bc',
-            output_shape=(num_kv_heads, units_per_head),
-            bias_axes='bc' if use_bias else None,
-            kernel_init=utils.shard_initializer(
-                config.k_kernel_init,
-                config.k_kernel_sharding,
-                projectable=True,
-                axes_types=(meta.AxisType.FANIN, None, None),
-            ),
-            bias_init=utils.shard_initializer(
-                config.bias_init, config.bias_sharding
-            ),
-            precision=precision,
-            compute_dtype=compute_dtype,
-            param_dtype=param_dtype,
-            einsum_factory=config.einsum_factory,
-            quantization_provider=config.quantization_provider,
-            name='key_projection',
-        )
-        self._v = utils.FlaxEinsumDense(
-            equation='...a,abc->...bc',
-            output_shape=(num_kv_heads, units_per_head),
-            bias_axes='bc' if use_bias else None,
-            kernel_init=utils.shard_initializer(
-                config.v_kernel_init,
-                config.v_kernel_sharding,
-                projectable=True,
-                axes_types=(meta.AxisType.FANIN, None, None),
-            ),
-            bias_init=utils.shard_initializer(
-                config.bias_init, config.bias_sharding
-            ),
-            precision=precision,
-            compute_dtype=compute_dtype,
-            param_dtype=param_dtype,
-            einsum_factory=config.einsum_factory,
-            quantization_provider=config.quantization_provider,
-            name='value_projection',
-        )
-      case QueryAndKeyValueProjection():
-        self._q = utils.FlaxEinsumDense(
-            equation='...a,abc->...bc',
-            output_shape=(num_query_heads, units_per_head),
-            bias_axes='bc' if use_bias else None,
-            kernel_init=utils.shard_initializer(
-                config.q_kernel_init,
-                config.q_kernel_sharding,
-                projectable=True,
-                axes_types=(meta.AxisType.FANIN, None, None),
-            ),
-            bias_init=utils.shard_initializer(
-                config.q_bias_init, config.q_bias_sharding
-            ),
-            precision=precision,
-            compute_dtype=compute_dtype,
-            param_dtype=param_dtype,
-            einsum_factory=config.einsum_factory,
-            quantization_provider=config.quantization_provider,
-            name='query_projection',
-        )
-        self._kv = utils.FlaxEinsumDense(
-            equation='...a,abcd->...bcd',
-            output_shape=(2, num_kv_heads, units_per_head),
-            bias_axes='bcd' if use_bias else None,
-            kernel_init=utils.shard_initializer(
-                config.kv_kernel_init,
-                config.kv_kernel_sharding,
-                projectable=True,
-                axes_types=(
-                    meta.AxisType.FANIN,
-                    meta.AxisType.STACKED,
-                    None,
-                    None,
-                ),
-            ),
-            bias_init=utils.shard_initializer(
-                config.kv_bias_init, config.kv_bias_sharding
-            ),
-            precision=precision,
-            compute_dtype=compute_dtype,
-            param_dtype=param_dtype,
-            einsum_factory=config.einsum_factory,
-            quantization_provider=config.quantization_provider,
-            name='key_value_projection',
-        )
-      case QueryAndSharedKeyValueProjection():
-        self._q = utils.FlaxEinsumDense(
-            equation='...a,abc->...bc',
-            output_shape=(num_query_heads, units_per_head),
-            bias_axes='bc' if use_bias else None,
-            kernel_init=utils.shard_initializer(
-                config.q_kernel_init,
-                config.q_kernel_sharding,
-                projectable=True,
-                axes_types=(meta.AxisType.FANIN, None, None),
-            ),
-            bias_init=utils.shard_initializer(
-                config.q_bias_init, config.q_bias_sharding
-            ),
-            precision=precision,
-            compute_dtype=compute_dtype,
-            param_dtype=param_dtype,
-            einsum_factory=config.einsum_factory,
-            quantization_provider=config.quantization_provider,
-            name='query_projection',
-        )
-        self._shared_kv = utils.FlaxEinsumDense(
-            equation='...a,abc->...bc',
-            output_shape=(num_kv_heads, units_per_head),
-            bias_axes='bc' if use_bias else None,
-            kernel_init=utils.shard_initializer(
-                config.kv_kernel_init,
-                config.kv_kernel_sharding,
-                projectable=True,
-                axes_types=(
-                    meta.AxisType.FANIN,
-                    None,
-                    None,
-                ),
-            ),
-            bias_init=utils.shard_initializer(
-                config.kv_bias_init, config.kv_bias_sharding
-            ),
-            precision=precision,
-            compute_dtype=compute_dtype,
-            param_dtype=param_dtype,
-            einsum_factory=config.einsum_factory,
-            quantization_provider=config.quantization_provider,
-            name='shared_key_value_projection',
-        )
-
-  def get_input_projection_output_dtype(
-      self,
-      config: QueryKeyValueProjectionConfig,
-      input_dtype: types.DType,
-      constants: types.Constants | None = None,
-  ) -> types.DType:
-    """Returns the output dtype of the QKV projection."""
-    match config:
-      case CombinedQueryKeyValueProjection():
-        return self._qkv.get_output_dtype(input_dtype, constants=constants)
-      case (
-          SeparateQueryKeyValueProjection()
-          | QueryAndKeyValueProjection()
-          | QueryAndSharedKeyValueProjection()
-      ):
-        return self._q.get_output_dtype(input_dtype, constants=constants)
-      case _:
-        raise NotImplementedError(config)
-
-  def get_qkv(
-      self, config: QueryKeyValueProjectionConfig, x: types.Sequence
-  ) -> tuple[types.Sequence, types.Sequence, types.Sequence]:
-    """Project input to query/key/value sequences."""
-    match config:
-      case CombinedQueryKeyValueProjection():
-        projection = utils.sequence_unstack(
-            self._qkv.project_sequence(x), axis=2
-        )
-
-        if len(projection) == 2:
-          # Shared K and V.
-          queries, keys = projection
-          values = keys
-        else:
-          queries, keys, values = projection
-      case SeparateQueryKeyValueProjection():
-        queries = self._q.project_sequence(x)
-        keys = self._k.project_sequence(x)
-        values = self._v.project_sequence(x)
-      case QueryAndKeyValueProjection():
-        queries = self._q.project_sequence(x)
-        keys, values = utils.sequence_unstack(
-            self._kv.project_sequence(x), axis=2
-        )
-      case QueryAndSharedKeyValueProjection():
-        queries = self._q.project_sequence(x)
-        keys = values = self._shared_kv.project_sequence(x)
-      case _:
-        raise NotImplementedError(config)
-    return queries, keys, values
-
-  def get_q(
-      self, config: QueryKeyValueProjectionConfig, x: types.Sequence
-  ) -> types.Sequence:
-    """Project input to query sequence."""
-    match config:
-      case SeparateQueryKeyValueProjection():
-        queries = self._q.project_sequence(x)
-      case QueryAndKeyValueProjection():
-        queries = self._q.project_sequence(x)
-      case QueryAndSharedKeyValueProjection():
-        queries = self._q.project_sequence(x)
-      case _:
-        raise NotImplementedError(config)
-    return queries
-
-  def get_kv(
-      self, config: QueryKeyValueProjectionConfig, x: types.Sequence
-  ) -> tuple[types.Sequence, types.Sequence]:
-    """Project input to key/value sequences."""
-    match config:
-      case SeparateQueryKeyValueProjection():
-        keys = self._k.project_sequence(x)
-        values = self._v.project_sequence(x)
-      case QueryAndKeyValueProjection():
-        keys, values = utils.sequence_unstack(
-            self._kv.project_sequence(x), axis=2
-        )
-      case QueryAndSharedKeyValueProjection():
-        keys = values = self._shared_kv.project_sequence(x)
-      case _:
-        raise NotImplementedError(config)
-    return keys, values
 
 
 class SelfAttentionEmits(struct.PyTreeNode):
diff --git a/sequence_layers/jax/attention/dot_product_attention.py b/sequence_layers/jax/attention/dot_product_attention.py
index 6ff3663..37faef0 100644
--- a/sequence_layers/jax/attention/dot_product_attention.py
+++ b/sequence_layers/jax/attention/dot_product_attention.py
@@ -28,7 +28,6 @@
 
 class DotProductAttention(
     types.Emitting,
-    common.AttentionInputProjectionHelper,
     attention_spec.DotProductAttention[types.Sequence, types.ChannelSpec],
 ):
   """Dot product attention."""
diff --git a/sequence_layers/jax/attention/dot_product_self_attention.py b/sequence_layers/jax/attention/dot_product_self_attention.py
index a1f9a49..52ee2c0 100644
--- a/sequence_layers/jax/attention/dot_product_self_attention.py
+++ b/sequence_layers/jax/attention/dot_product_self_attention.py
@@ -28,7 +28,6 @@
 
 class DotProductSelfAttention(
     types.Emitting,
-    common.AttentionInputProjectionHelper,
     attention_spec.DotProductSelfAttention[types.Sequence, types.ChannelSpec],
 ):
   """A multi-headed dot-product self attention layer."""
diff --git a/sequence_layers/jax/attention/local_dot_product_self_attention.py b/sequence_layers/jax/attention/local_dot_product_self_attention.py
index 0f8fcbe..0fe0e13 100644
--- a/sequence_layers/jax/attention/local_dot_product_self_attention.py
+++ b/sequence_layers/jax/attention/local_dot_product_self_attention.py
@@ -27,7 +27,6 @@
 
 class LocalDotProductSelfAttention(
     types.Emitting,
-    common.AttentionInputProjectionHelper,
     attention_spec.LocalDotProductSelfAttention[types.Sequence, types.ChannelSpec],
 ):
   """A multi-headed dot-product self attention layer."""
diff --git a/sequence_layers/jax/attention/streaming_dot_product_attention.py b/sequence_layers/jax/attention/streaming_dot_product_attention.py
index 453bae4..c3a5907 100644
--- a/sequence_layers/jax/attention/streaming_dot_product_attention.py
+++ b/sequence_layers/jax/attention/streaming_dot_product_attention.py
@@ -26,7 +26,6 @@
 
 class StreamingDotProductAttention(
     types.Emitting,
-    common.AttentionInputProjectionHelper,
     attention_spec.StreamingDotProductAttention[types.Sequence, types.ChannelSpec],
 ):
   """A multi-headed streaming dot-product attention layer.
diff --git a/sequence_layers/jax/attention/streaming_local_dot_product_attention.py b/sequence_layers/jax/attention/streaming_local_dot_product_attention.py
index 5e30f72..af2c539 100644
--- a/sequence_layers/jax/attention/streaming_local_dot_product_attention.py
+++ b/sequence_layers/jax/attention/streaming_local_dot_product_attention.py
@@ -26,7 +26,6 @@
 
 class StreamingLocalDotProductAttention(
     types.Emitting,
-    common.AttentionInputProjectionHelper,
     attention_spec.StreamingDotProductAttention[types.Sequence, types.ChannelSpec],
 ):
   """A multi-headed streaming local dot-product attention layer.

From aade0b1adfe00e7de9d6747e22d125f118747f05 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 21:36:28 +0000
Subject: [PATCH 15/29] style(jax): explicitly restate inherited config fields
 in conditioning and position layers

- Restated all inherited config dataclass fields in `Conditioning.Config` (in `conditioning.py`) and `AddTimingSignal.Config`, `ApplyRotaryPositionalEncoding.Config` (in `position.py`).
- This ensures uniform coding style across JAX configurations, improves self-documentation and IDE autocomplete, and removes any risk of implicit dataclass inheritance behavior.

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/jax/conditioning.py | 10 +++++++++-
 sequence_layers/jax/position.py     | 16 ++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/sequence_layers/jax/conditioning.py b/sequence_layers/jax/conditioning.py
index 1490a8d..6eab012 100644
--- a/sequence_layers/jax/conditioning.py
+++ b/sequence_layers/jax/conditioning.py
@@ -338,12 +338,20 @@ class Conditioning(
   class Config(conditioning_spec.Conditioning.Config):
     """Config for Conditioning."""
 
-    # Override defaults or add JAX-specific fields
+    # Stated explicitly for JAX documentation and IDE support
+    conditioning_name: str
+    projection: BaseConditioning.Projection
+    combination: BaseConditioning.Combination
+    projection_channel_shape: types.Shape | None = None
+    streaming: bool = False
+    affine_scale_offset: complex = 1.0
+    compute_dtype: types.DType | None = None
     param_dtype: types.DType = jnp.float32
     kernel_init: nn.initializers.Initializer = nn.linear.default_kernel_init
     kernel_sharding: types.Sharding | None = None
     bias_init: nn.initializers.Initializer = nn.initializers.zeros_init()
     bias_sharding: types.Sharding | None = None
+    name: str | None = None
 
     @override
     def make(self) -> 'Conditioning':
diff --git a/sequence_layers/jax/position.py b/sequence_layers/jax/position.py
index 670045c..65b9ea1 100644
--- a/sequence_layers/jax/position.py
+++ b/sequence_layers/jax/position.py
@@ -45,7 +45,15 @@ class AddTimingSignal(
   class Config(position_spec.AddTimingSignal.Config):
     """Config for AddTimingSignal."""
 
+    # Stated explicitly for JAX documentation and IDE support
+    min_timescale: float = 1.0
+    max_timescale: float = 1.0e4
+    trainable_scale: bool = False
+    axes: int | tuple[int, ...] | None = None
+    sharding: types.Sharding | None = None
     param_dtype: types.DType = jnp.float32
+    only_advance_position_for_valid_timesteps: bool = True
+    name: str | None = None
 
     @override
     def make(self) -> 'AddTimingSignal':
@@ -194,6 +202,14 @@ class ApplyRotaryPositionalEncoding(
   class Config(position_spec.ApplyRotaryPositionalEncoding.Config):
     """Config for ApplyRotaryPositionalEncoding."""
 
+    # Stated explicitly for JAX documentation and IDE support
+    max_wavelength: float
+    axis: int = -1
+    only_advance_position_for_valid_timesteps: bool = True
+    positions_in_at_least_fp32: bool = True
+    positions_name: str | None = None
+    name: str | None = None
+
     @override
     def make(self) -> 'ApplyRotaryPositionalEncoding':
       return ApplyRotaryPositionalEncoding(self, name=self.name)

From ac7831f8d23b3ceba8d0ddc018191412296eaf99 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 23:55:59 +0000
Subject: [PATCH 16/29] chore(specs): clean up private exports and testonly
 symbols from package roots

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/jax/__init__.py            |   5 +-
 sequence_layers/mlx/__init__.py            | 126 ++++++++++-----------
 sequence_layers/specs/__init__.py          |  15 +--
 sequence_layers/specs/backend_behaviors.py |   4 +-
 sequence_layers/specs/test_utils.py        |   8 +-
 5 files changed, 72 insertions(+), 86 deletions(-)

diff --git a/sequence_layers/jax/__init__.py b/sequence_layers/jax/__init__.py
index bdbcf7d..e3d1d53 100644
--- a/sequence_layers/jax/__init__.py
+++ b/sequence_layers/jax/__init__.py
@@ -30,7 +30,4 @@
 
 # (re-export the names for typechecking)
 # pylint: disable=useless-import-alias
-from . import backend as backend
-from . import test_utils as test_utils
-from . import types as types
-from .test_utils import SequenceLayerTest
+from sequence_layers.jax import types as types
diff --git a/sequence_layers/mlx/__init__.py b/sequence_layers/mlx/__init__.py
index a4b0940..40b904b 100644
--- a/sequence_layers/mlx/__init__.py
+++ b/sequence_layers/mlx/__init__.py
@@ -20,131 +20,125 @@
 # Pyrefly fails to resolve the concrete method implementations in `mlx/types.py`
 # and flags all instances as abstract (`bad-instantiation` false positives).
 #
-# Explicit imports (e.g., `from .simple import Relu`) DO NOT trigger this issue.
+# Explicit imports (e.g., `from sequence_layers.mlx.simple import Relu`) DO NOT trigger this issue.
 # If you need to expose specific layers at the package level, import them
 # explicitly instead of using a star import.
-from . import attention
-from . import backend
-from . import dense
-from . import dsp
-from . import projection_configs
-from . import simple
-from . import test_utils
-from . import types
-from . import types as basic_types
-from . import utils
-from .attention import DotProductAttention
+from sequence_layers.mlx import attention
+from sequence_layers.mlx import dense
+from sequence_layers.mlx import dsp
+from sequence_layers.mlx import projection_configs
+from sequence_layers.mlx import simple
+from sequence_layers.mlx import types
+from sequence_layers.mlx import types as basic_types
+from sequence_layers.mlx import utils
+from sequence_layers.mlx.attention import DotProductAttention
 from .attention import DotProductSelfAttention
-from .attention import LocalDotProductSelfAttention
+from sequence_layers.mlx.attention import LocalDotProductSelfAttention
 from .attention import StreamingDotProductAttention
-from .attention import StreamingLocalDotProductAttention
+from sequence_layers.mlx.attention import StreamingLocalDotProductAttention
 from .combinators import CombinationMode
-from .combinators import Parallel
+from sequence_layers.mlx.combinators import Parallel
 from .combinators import Repeat
-from .combinators import Residual
+from sequence_layers.mlx.combinators import Residual
 from .combinators import Serial
-from .combinators import SerialCombinatorMixin
+from sequence_layers.mlx.combinators import SerialCombinatorMixin
 from .combinators import SerialModules
-from .conditioning import Conditioning
+from sequence_layers.mlx.conditioning import Conditioning
 from .convolution import Conv1D
-from .convolution import Conv1DTranspose
+from sequence_layers.mlx.convolution import Conv1DTranspose
 from .convolution import DepthwiseConv1D
-from .convolution2d import AveragePooling2D
+from sequence_layers.mlx.convolution2d import AveragePooling2D
 from .convolution2d import Conv2D
-from .convolution2d import Conv2DTranspose
+from sequence_layers.mlx.convolution2d import Conv2DTranspose
 from .convolution2d import ParallelChannels
-from .convolution2d import Upsample2D
+from sequence_layers.mlx.convolution2d import Upsample2D
 from .dense import Dense
-from .dense import EinsumDense
+from sequence_layers.mlx.dense import EinsumDense
 from .dsp import Delay
-from .dsp import FFT
+from sequence_layers.mlx.dsp import FFT
 from .dsp import Frame
-from .dsp import IFFT
+from sequence_layers.mlx.dsp import IFFT
 from .dsp import InverseSTFT
-from .dsp import IRFFT
+from sequence_layers.mlx.dsp import IRFFT
 from .dsp import LinearToMelSpectrogram
-from .dsp import Lookahead
+from sequence_layers.mlx.dsp import Lookahead
 from .dsp import OverlapAdd
-from .dsp import RFFT
+from sequence_layers.mlx.dsp import RFFT
 from .dsp import STFT
-from .dsp import Window
+from sequence_layers.mlx.dsp import Window
 from .normalization import BatchNormalization
-from .normalization import GroupNormalization
+from sequence_layers.mlx.normalization import GroupNormalization
 from .normalization import L2Normalize
-from .normalization import LayerNormalization
+from sequence_layers.mlx.normalization import LayerNormalization
 from .normalization import RMSNormalization
-from .pooling import AveragePooling1D
+from sequence_layers.mlx.pooling import AveragePooling1D
 from .pooling import MaxPooling1D
-from .pooling import MinPooling1D
+from sequence_layers.mlx.pooling import MinPooling1D
 from .position import AddTimingSignal
-from .position import ApplyRotaryPositionalEncoding
+from sequence_layers.mlx.position import ApplyRotaryPositionalEncoding
 from .projection_configs import CombinedQueryKeyValueProjection
-from .projection_configs import QueryAndKeyValueProjection
+from sequence_layers.mlx.projection_configs import QueryAndKeyValueProjection
 from .projection_configs import QueryAndSharedKeyValueProjection
-from .projection_configs import SeparateQueryKeyValueProjection
+from sequence_layers.mlx.projection_configs import SeparateQueryKeyValueProjection
 from .simple import Abs
-from .simple import Add
+from sequence_layers.mlx.simple import Add
 from .simple import Cast
-from .simple import CheckpointName
+from sequence_layers.mlx.simple import CheckpointName
 from .simple import Downsample1D
-from .simple import Dropout
+from sequence_layers.mlx.simple import Dropout
 from .simple import Elu
-from .simple import Embedding
+from sequence_layers.mlx.simple import Embedding
 from .simple import Exp
-from .simple import ExpandDims
+from sequence_layers.mlx.simple import ExpandDims
 from .simple import Flatten
-from .simple import GatedLinearUnit
+from sequence_layers.mlx.simple import GatedLinearUnit
 from .simple import GatedTanhUnit
-from .simple import GatedUnit
+from sequence_layers.mlx.simple import GatedUnit
 from .simple import Gelu
-from .simple import Identity
+from sequence_layers.mlx.simple import Identity
 from .simple import Lambda
-from .simple import LeakyRelu
+from sequence_layers.mlx.simple import LeakyRelu
 from .simple import Log
-from .simple import Logging
+from sequence_layers.mlx.simple import Logging
 from .simple import MaskInvalid
-from .simple import OneHot
+from sequence_layers.mlx.simple import OneHot
 from .simple import Relu
-from .simple import Reshape
+from sequence_layers.mlx.simple import Reshape
 from .simple import Scale
-from .simple import Sigmoid
+from sequence_layers.mlx.simple import Sigmoid
 from .simple import Softmax
-from .simple import Softplus
+from sequence_layers.mlx.simple import Softplus
 from .simple import Squeeze
-from .simple import Swish
+from sequence_layers.mlx.simple import Swish
 from .simple import Tanh
-from .simple import Transpose
+from sequence_layers.mlx.simple import Transpose
 from .simple import Upsample1D
-from .test_utils import SequenceLayerTest
-from .types import ChannelSpec
+from sequence_layers.mlx.types import ChannelSpec
 from .types import check_layer
-from .types import check_step
+from sequence_layers.mlx.types import check_step
 from .types import Constants
-from .types import DType
+from sequence_layers.mlx.types import DType
 from .types import Emits
-from .types import Emitting
+from sequence_layers.mlx.types import Emitting
 from .types import MaskedSequence
-from .types import MaskT
+from sequence_layers.mlx.types import MaskT
 from .types import PreservesShape
-from .types import PreservesType
+from sequence_layers.mlx.types import PreservesType
 from .types import Sequence
-from .types import SequenceLayer
+from sequence_layers.mlx.types import SequenceLayer
 from .types import SequenceLayerConfig
-from .types import Shape
+from sequence_layers.mlx.types import Shape
 from .types import ShapeDType
-from .types import ShapeLike
+from sequence_layers.mlx.types import ShapeLike
 from .types import State
-from .types import Stateless
+from sequence_layers.mlx.types import Stateless
 from .types import StatelessPointwise
 
 __all__ = [
     'basic_types',
     'dense',
-    'backend',
     'simple',
     'types',
-    'test_utils',
-    'SequenceLayerTest',
     'Constants',
     'Sequence',
     'MaskedSequence',
diff --git a/sequence_layers/specs/__init__.py b/sequence_layers/specs/__init__.py
index 1cacfc7..7cb4d87 100644
--- a/sequence_layers/specs/__init__.py
+++ b/sequence_layers/specs/__init__.py
@@ -18,8 +18,8 @@
 
 from typing import Any, Protocol, runtime_checkable
 
-from . import test_utils_spec as _test_utils_spec
-from . import types as _types
+from sequence_layers.specs import test_utils_spec as _test_utils_spec
+from sequence_layers.specs import types as _types
 
 
 @runtime_checkable
@@ -28,18 +28,10 @@ class ModuleSpec(Protocol):
 
   # pylint: disable=missing-function-docstring
 
-  @property
-  def backend(self) -> Any:
-    ...
-
   @property
   def types(self) -> _types.ModuleSpec:
     ...
 
-  @property
-  def test_utils(self) -> _test_utils_spec.ModuleSpec:
-    ...
-
   # pylint: disable=invalid-name
 
   # Identifiers that backend-specific implementations should expose at top
@@ -62,6 +54,3 @@ def SequenceLayer(self) -> type[_types.SequenceLayer]:
   def SequenceLayerConfig(self) -> type[_types.SequenceLayerConfig]:
     ...
 
-  @property
-  def SequenceLayerTest(self) -> type[Any]:
-    ...
diff --git a/sequence_layers/specs/backend_behaviors.py b/sequence_layers/specs/backend_behaviors.py
index 9123112..274817e 100644
--- a/sequence_layers/specs/backend_behaviors.py
+++ b/sequence_layers/specs/backend_behaviors.py
@@ -26,4 +26,6 @@ class ModuleSpecTest(test_utils_spec.ModuleSpecTest):
 
   @override
   def module_spec_pairs(self, backend_sl: specs.ModuleSpec):
-    return {backend_sl.backend: backend_spec.ModuleSpec}
+    import importlib  # pylint: disable=g-import-not-at-top
+    backend = importlib.import_module(backend_sl.__name__ + '.backend')
+    return {backend: backend_spec.ModuleSpec}
diff --git a/sequence_layers/specs/test_utils.py b/sequence_layers/specs/test_utils.py
index 2410668..d172a04 100644
--- a/sequence_layers/specs/test_utils.py
+++ b/sequence_layers/specs/test_utils.py
@@ -159,12 +159,16 @@ class SequenceLayerTest(
   @property
   def xp(self) -> backend_spec.xp:
     """Returns the backend wrapper."""
-    return self.sl.backend.xp
+    import importlib  # pylint: disable=g-import-not-at-top
+    backend = importlib.import_module(self.sl.__name__ + '.backend')
+    return backend.xp
 
   @property
   def nn(self) -> backend_spec.nn:
     """Returns the backend nn wrapper."""
-    return self.sl.backend.nn
+    import importlib  # pylint: disable=g-import-not-at-top
+    backend = importlib.import_module(self.sl.__name__ + '.backend')
+    return backend.nn
 
   def make_layer(self, config: types_spec.SequenceLayerConfig) -> Any:
     """Instantiates a layer from its config, delegating to the backend."""

From 06ababe902e52def140db4e480284a50d967b70c Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 23:56:34 +0000
Subject: [PATCH 17/29] refactor(simple): decouple gated unit configs from
 GatedUnit.Config

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/jax/simple.py   | 8 ++++----
 sequence_layers/mlx/simple.py   | 4 ++--
 sequence_layers/specs/simple.py | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sequence_layers/jax/simple.py b/sequence_layers/jax/simple.py
index d0b5de8..9b6d384 100644
--- a/sequence_layers/jax/simple.py
+++ b/sequence_layers/jax/simple.py
@@ -836,13 +836,13 @@ class GatedLinearUnit(
   """Computes a Gated Linear Unit, reducing the input channels by 2x."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(GatedUnit.Config, spec.GatedLinearUnit.Config):
+  class Config(spec.GatedLinearUnit.Config):
     name: str | None = None
 
     @override
     def make(self) -> 'GatedLinearUnit':
       return GatedLinearUnit(
-          config=GatedUnit.Config(
+          GatedUnit.Config(
               None,
               typing.cast(
                   typing.Callable[[types.ArrayLike], types.ArrayLike],
@@ -860,13 +860,13 @@ class GatedTanhUnit(
   """Computes a Gated Tanh Unit, reducing the input channels by 2x."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(GatedUnit.Config, spec.GatedTanhUnit.Config):
+  class Config(spec.GatedTanhUnit.Config):
     name: str | None = None
 
     @override
     def make(self) -> 'GatedTanhUnit':
       return GatedTanhUnit(
-          config=GatedUnit.Config(
+          GatedUnit.Config(
               typing.cast(
                   typing.Callable[[types.ArrayLike], types.ArrayLike],
                   jax.nn.tanh,
diff --git a/sequence_layers/mlx/simple.py b/sequence_layers/mlx/simple.py
index 1744734..dfa9cd4 100644
--- a/sequence_layers/mlx/simple.py
+++ b/sequence_layers/mlx/simple.py
@@ -752,7 +752,7 @@ class GatedLinearUnit(
   """Computes a Gated Linear Unit, reducing input channels by 2x."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(GatedUnit.Config, spec.GatedLinearUnit.Config):
+  class Config(spec.GatedLinearUnit.Config):
     """Configuration for GatedLinearUnit layer."""
 
     name: str | None = None
@@ -775,7 +775,7 @@ class GatedTanhUnit(
   """Computes a Gated Tanh Unit, reducing input channels by 2x."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config(GatedUnit.Config, spec.GatedTanhUnit.Config):
+  class Config(spec.GatedTanhUnit.Config):
     """Configuration for GatedTanhUnit layer."""
 
     name: str | None = None
diff --git a/sequence_layers/specs/simple.py b/sequence_layers/specs/simple.py
index 760afe3..7833c8e 100644
--- a/sequence_layers/specs/simple.py
+++ b/sequence_layers/specs/simple.py
@@ -277,7 +277,7 @@ class GatedLinearUnit[
   """Specification for GatedLinearUnit layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config[T](GatedUnit.Config[T]):
+  class Config(types_spec.SequenceLayerConfig):
     """Configuration for GatedLinearUnit layer."""
 
 
@@ -287,7 +287,7 @@ class GatedTanhUnit[
   """Specification for GatedTanhUnit layer."""
 
   @dataclasses.dataclass(frozen=True)
-  class Config[T](GatedUnit.Config[T]):
+  class Config(types_spec.SequenceLayerConfig):
     """Configuration for GatedTanhUnit layer."""
 
 

From f9e9429931ce6d695227f8f458782cd77a1941f9 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Mon, 1 Jun 2026 23:59:49 +0000
Subject: [PATCH 18/29] style(jax): strip config= keyword argument in simple
 layers

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/jax/simple.py | 102 +++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 51 deletions(-)

diff --git a/sequence_layers/jax/simple.py b/sequence_layers/jax/simple.py
index 9b6d384..dd8a8e6 100644
--- a/sequence_layers/jax/simple.py
+++ b/sequence_layers/jax/simple.py
@@ -203,7 +203,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'Scale':
-      return Scale(config=self, name=self.name)
+      return Scale(self, name=self.name)
 
   config: Config
 
@@ -254,7 +254,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'Affine':
-      return Affine(config=self, name=self.name)
+      return Affine(self, name=self.name)
 
   @override
   def setup(self):
@@ -344,7 +344,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'Add':
-      return Add(config=self, name=self.name)
+      return Add(self, name=self.name)
 
   config: Config
 
@@ -393,7 +393,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'Maximum':
-      return Maximum(config=self, name=self.name)
+      return Maximum(self, name=self.name)
 
   config: Config
 
@@ -442,7 +442,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'Mod':
-      return Mod(config=self, name=self.name)
+      return Mod(self, name=self.name)
 
   config: Config
 
@@ -493,7 +493,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'Minimum':
-      return Minimum(config=self, name=self.name)
+      return Minimum(self, name=self.name)
 
   config: Config
 
@@ -617,7 +617,7 @@ class Config(_ReduceChannels.Config):
 
     @override
     def make(self) -> 'Mean':
-      return Mean(config=self, name=self.name)
+      return Mean(self, name=self.name)
 
   @property
   @override
@@ -634,7 +634,7 @@ class Config(_ReduceChannels.Config):
 
     @override
     def make(self) -> 'Min':
-      return Min(config=self, name=self.name)
+      return Min(self, name=self.name)
 
   @property
   @override
@@ -651,7 +651,7 @@ class Config(_ReduceChannels.Config):
 
     @override
     def make(self) -> 'Max':
-      return Max(config=self, name=self.name)
+      return Max(self, name=self.name)
 
   @property
   @override
@@ -668,7 +668,7 @@ class Config(_ReduceChannels.Config):
 
     @override
     def make(self) -> 'Sum':
-      return Sum(config=self, name=self.name)
+      return Sum(self, name=self.name)
 
   @property
   @override
@@ -689,7 +689,7 @@ class Config(spec.Abs.Config):
 
     @override
     def make(self) -> 'Abs':
-      return Abs(config=self, name=self.name)
+      return Abs(self, name=self.name)
 
   config: Config
 
@@ -740,7 +740,7 @@ class Config(spec.Cast.Config):
 
     @override
     def make(self) -> 'Cast':
-      return Cast(config=self, name=self.name)
+      return Cast(self, name=self.name)
 
   config: Config
 
@@ -790,7 +790,7 @@ class Config(spec.GatedUnit.Config):
 
     @override
     def make(self) -> 'GatedUnit':
-      return GatedUnit(config=self, name=self.name)
+      return GatedUnit(self, name=self.name)
 
   config: Config
 
@@ -892,7 +892,7 @@ class Config(types.SequenceLayerConfig):
     @override
     def make(self) -> 'GradientClipping':
       assert self.clip_value > 0
-      return GradientClipping(config=self, name=self.name)
+      return GradientClipping(self, name=self.name)
 
   config: Config
 
@@ -1001,7 +1001,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'ApplySharding':
-      return ApplySharding(config=self, name=self.name)
+      return ApplySharding(self, name=self.name)
 
   config: Config
 
@@ -1042,7 +1042,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'OptimizationBarrier':
-      return OptimizationBarrier(config=self, name=self.name)
+      return OptimizationBarrier(self, name=self.name)
 
   config: Config
 
@@ -1245,7 +1245,7 @@ class Config(spec.CheckpointName.Config):
 
     @override
     def make(self) -> 'CheckpointName':
-      return CheckpointName(config=self, name=self.name)
+      return CheckpointName(self, name=self.name)
 
   config: Config
 
@@ -1290,7 +1290,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'Snake':
-      return Snake(config=self, name=self.name)
+      return Snake(self, name=self.name)
 
   config: Config
 
@@ -1346,7 +1346,7 @@ class Config(spec.Tanh.Config):
 
     @override
     def make(self) -> 'Tanh':
-      return Tanh(config=self, name=self.name)
+      return Tanh(self, name=self.name)
 
   config: Config
 
@@ -1382,7 +1382,7 @@ class Config(spec.Relu.Config):
 
     @override
     def make(self) -> 'Relu':
-      return Relu(config=self, name=self.name)
+      return Relu(self, name=self.name)
 
   config: Config
 
@@ -1419,7 +1419,7 @@ class Config(spec.LeakyRelu.Config):
 
     @override
     def make(self) -> 'LeakyRelu':
-      return LeakyRelu(config=self, name=self.name)
+      return LeakyRelu(self, name=self.name)
 
   config: Config
 
@@ -1453,7 +1453,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'PRelu':
-      return PRelu(config=self, name=self.name)
+      return PRelu(self, name=self.name)
 
   config: Config
 
@@ -1508,7 +1508,7 @@ class Config(spec.Elu.Config):
 
     @override
     def make(self) -> 'Elu':
-      return Elu(config=self, name=self.name)
+      return Elu(self, name=self.name)
 
   config: Config
 
@@ -1544,7 +1544,7 @@ class Config(spec.Exp.Config):
 
     @override
     def make(self) -> 'Exp':
-      return Exp(config=self, name=self.name)
+      return Exp(self, name=self.name)
 
   config: Config
 
@@ -1575,7 +1575,7 @@ class Config(spec.Log.Config):
 
     @override
     def make(self) -> 'Log':
-      return Log(config=self, name=self.name)
+      return Log(self, name=self.name)
 
   config: Config
 
@@ -1603,7 +1603,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'Power':
-      return Power(config=self, name=self.name)
+      return Power(self, name=self.name)
 
   config: Config
 
@@ -1639,7 +1639,7 @@ class Config(spec.Sigmoid.Config):
 
     @override
     def make(self) -> 'Sigmoid':
-      return Sigmoid(config=self, name=self.name)
+      return Sigmoid(self, name=self.name)
 
   config: Config
 
@@ -1670,7 +1670,7 @@ class Config(spec.Softplus.Config):
 
     @override
     def make(self) -> 'Softplus':
-      return Softplus(config=self, name=self.name)
+      return Softplus(self, name=self.name)
 
   config: Config
 
@@ -1702,7 +1702,7 @@ class Config(spec.Softmax.Config):
 
     @override
     def make(self) -> 'Softmax':
-      return Softmax(config=self, name=self.name)
+      return Softmax(self, name=self.name)
 
   config: Config
 
@@ -1774,7 +1774,7 @@ class Config(spec.Gelu.Config):
 
     @override
     def make(self) -> 'Gelu':
-      return Gelu(config=self, name=self.name)
+      return Gelu(self, name=self.name)
 
   config: Config
 
@@ -1828,7 +1828,7 @@ def as_slices(self) -> tuple[slice | int | None, ...]:
 
     @override
     def make(self) -> 'Slice':
-      return Slice(config=self, name=self.name)
+      return Slice(self, name=self.name)
 
   config: Config
 
@@ -1949,7 +1949,7 @@ class Config(spec.OneHot.Config):
 
     @override
     def make(self) -> 'OneHot':
-      return OneHot(config=self, name=self.name)
+      return OneHot(self, name=self.name)
 
   config: Config
 
@@ -2031,7 +2031,7 @@ class Config(spec.Embedding.Config):
 
     @override
     def make(self) -> 'Embedding':
-      return Embedding(config=self, name=self.name)
+      return Embedding(self, name=self.name)
 
   config: Config
 
@@ -2164,7 +2164,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'EmbeddingTranspose':
-      return EmbeddingTranspose(config=self, name=self.name)
+      return EmbeddingTranspose(self, name=self.name)
 
   config: Config
 
@@ -2270,7 +2270,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'ExpandDims':
-      return ExpandDims(config=self, name=self.name)
+      return ExpandDims(self, name=self.name)
 
   config: Config
 
@@ -2347,7 +2347,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'Reshape':
-      return Reshape(config=self, name=self.name)
+      return Reshape(self, name=self.name)
 
   config: Config
 
@@ -2424,7 +2424,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'GlobalReshape':
-      return GlobalReshape(config=self, name=self.name)
+      return GlobalReshape(self, name=self.name)
 
   config: Config
 
@@ -2528,7 +2528,7 @@ def make(self) -> 'Transpose':
       if self.axes is not None and (0 in self.axes or 1 in self.axes):
         raise ValueError("Can't transpose batch or time dimension.")
 
-      return Transpose(config=self, name=self.name)
+      return Transpose(self, name=self.name)
 
   config: Config
 
@@ -2643,7 +2643,7 @@ def make(self) -> 'MoveAxis':
             f' {len(destination)}'
         )
 
-      return MoveAxis(config=self, name=self.name)
+      return MoveAxis(self, name=self.name)
 
   # pyrefly: ignore[bad-override]
   config: Config
@@ -2675,7 +2675,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'Emit':
-      return Emit(config=self, name=self.name)
+      return Emit(self, name=self.name)
 
   config: Config
 
@@ -2704,7 +2704,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'NamedEmit':
-      return NamedEmit(config=self, name=self.name)
+      return NamedEmit(self, name=self.name)
 
   config: Config
 
@@ -2738,7 +2738,7 @@ class Config(spec.Dropout.Config):
 
     @override
     def make(self) -> 'Dropout':
-      return Dropout(config=self, name=self.name)
+      return Dropout(self, name=self.name)
 
   config: Config
 
@@ -2928,7 +2928,7 @@ class Config(spec.Downsample1D.Config):
 
     @override
     def make(self) -> 'Downsample1D':
-      return Downsample1D(config=self, name=self.name)
+      return Downsample1D(self, name=self.name)
 
   config: Config
 
@@ -2990,7 +2990,7 @@ class Config(spec.Upsample1D.Config):
 
     @override
     def make(self) -> 'Upsample1D':
-      return Upsample1D(config=self, name=self.name)
+      return Upsample1D(self, name=self.name)
 
   config: Config
 
@@ -3047,7 +3047,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'Upsample2D':
-      return Upsample2D(config=self, name=self.name)
+      return Upsample2D(self, name=self.name)
 
   config: Config
 
@@ -3116,7 +3116,7 @@ class Config(spec.MaskInvalid.Config):
 
     @override
     def make(self) -> 'MaskInvalid':
-      return MaskInvalid(config=self, name=self.name)
+      return MaskInvalid(self, name=self.name)
 
   config: Config
 
@@ -3165,7 +3165,7 @@ class Config(spec.Logging.Config):
 
     @override
     def make(self) -> 'Logging':
-      return Logging(config=self)
+      return Logging(self)
 
   config: Config
 
@@ -3265,7 +3265,7 @@ class Config(types.SequenceLayerConfig):
 
     @override
     def make(self) -> 'Argmax':
-      return Argmax(config=self, name=self.name)
+      return Argmax(self, name=self.name)
 
   config: Config
 
@@ -3331,7 +3331,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'EinopsRearrange':
-      return EinopsRearrange(config=self, name=self.name)
+      return EinopsRearrange(self, name=self.name)
 
   config: Config
 
@@ -3409,7 +3409,7 @@ def __post_init__(self):
 
     @override
     def make(self) -> 'GlobalEinopsRearrange':
-      return GlobalEinopsRearrange(config=self, name=self.name)
+      return GlobalEinopsRearrange(self, name=self.name)
 
   config: Config
 
@@ -3509,7 +3509,7 @@ def make(self) -> 'Squeeze':
       elif axis is not None and (0 in axis or 1 in axis):
         raise ValueError('Batch and time (axis=0 or 1) cannot be squeezed.')
 
-      return Squeeze(config=self, name=self.name)
+      return Squeeze(self, name=self.name)
 
   config: Config
 

From f754a8884db1304b11c145d5259c39d2118314e7 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Tue, 2 Jun 2026 00:00:12 +0000
Subject: [PATCH 19/29] refactor: unify get_initial_state and Lambda signatures
 to ChannelSpec

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/jax/combinators.py   |  6 +++---
 sequence_layers/jax/convolution.py   |  4 ++--
 sequence_layers/jax/normalization.py |  2 +-
 sequence_layers/jax/pooling.py       |  2 +-
 sequence_layers/jax/position.py      |  8 ++++----
 sequence_layers/jax/recurrent.py     |  4 ++--
 sequence_layers/jax/simple.py        | 12 ++++++------
 sequence_layers/mlx/pooling.py       |  2 +-
 8 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/sequence_layers/jax/combinators.py b/sequence_layers/jax/combinators.py
index b8d2235..df5803d 100644
--- a/sequence_layers/jax/combinators.py
+++ b/sequence_layers/jax/combinators.py
@@ -236,7 +236,7 @@ def output_ratio(self) -> fractions.Fraction:
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
@@ -523,7 +523,7 @@ def output_ratio(self) -> fractions.Fraction:
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
@@ -1031,7 +1031,7 @@ def layer_with_emits(
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
diff --git a/sequence_layers/jax/convolution.py b/sequence_layers/jax/convolution.py
index 2796943..31c49c5 100644
--- a/sequence_layers/jax/convolution.py
+++ b/sequence_layers/jax/convolution.py
@@ -270,7 +270,7 @@ def compute_conv_mask(
 
 def compute_conv_initial_state(
     batch_size: int,
-    input_spec: types.ShapeDType,
+    input_spec: types.ChannelSpec,
     buffer_width: int,
     padding: types.PaddingModeString,
     pad_value: complex | None = None,
@@ -632,7 +632,7 @@ def _buffer_width(self) -> int:
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
diff --git a/sequence_layers/jax/normalization.py b/sequence_layers/jax/normalization.py
index 7841821..e9c4575 100644
--- a/sequence_layers/jax/normalization.py
+++ b/sequence_layers/jax/normalization.py
@@ -712,7 +712,7 @@ def receptive_field_per_step(self) -> dict[int, types.ReceptiveField]:
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
diff --git a/sequence_layers/jax/pooling.py b/sequence_layers/jax/pooling.py
index 1c4677a..820bf3b 100644
--- a/sequence_layers/jax/pooling.py
+++ b/sequence_layers/jax/pooling.py
@@ -240,7 +240,7 @@ def _buffer_width(self) -> int:
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
diff --git a/sequence_layers/jax/position.py b/sequence_layers/jax/position.py
index 65b9ea1..84f04d2 100644
--- a/sequence_layers/jax/position.py
+++ b/sequence_layers/jax/position.py
@@ -76,7 +76,7 @@ def setup(self) -> None:
       self.scale = None
 
   @nn.nowrap
-  def _check_inputs(self, input_spec: types.ShapeDType):
+  def _check_inputs(self, input_spec: types.ChannelSpec):
     if input_spec.dtype not in (
         jnp.float16,
         jnp.bfloat16,
@@ -97,7 +97,7 @@ def receptive_field_per_step(self) -> dict[int, types.ReceptiveField]:
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
@@ -234,7 +234,7 @@ def receptive_field_per_step(self) -> dict[int, types.ReceptiveField]:
       return {0: (0, 0)}
 
   @nn.nowrap
-  def _check_inputs(self, input_spec: types.ShapeDType):
+  def _check_inputs(self, input_spec: types.ChannelSpec):
     if input_spec.dtype not in (
         jnp.float16,
         jnp.bfloat16,
@@ -265,7 +265,7 @@ def _check_inputs(self, input_spec: types.ShapeDType):
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
diff --git a/sequence_layers/jax/recurrent.py b/sequence_layers/jax/recurrent.py
index 8306ed9..4bc0e71 100644
--- a/sequence_layers/jax/recurrent.py
+++ b/sequence_layers/jax/recurrent.py
@@ -247,7 +247,7 @@ def layer(
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
@@ -670,7 +670,7 @@ def layer(
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
diff --git a/sequence_layers/jax/simple.py b/sequence_layers/jax/simple.py
index dd8a8e6..45e4980 100644
--- a/sequence_layers/jax/simple.py
+++ b/sequence_layers/jax/simple.py
@@ -1098,13 +1098,13 @@ class Config(spec.Lambda.Config):
     # If get_output_shape or get_output_dtype are called, the input_spec to use
     # for type or shape information (respectively). Prefer to use
     # get_output_spec to avoid having to specify this.
-    expected_input_spec: types.ShapeDType | None = None
+    expected_input_spec: types.ChannelSpec | None = None
     # An optional name for the layer.
     name: str | None = None
 
     @override
     def make(self) -> 'Lambda':
-      return Lambda(config=self, name=self.name)
+      return Lambda(self, name=self.name)
 
   config: Config
 
@@ -1113,7 +1113,7 @@ def make(self) -> 'Lambda':
   def supports_step(self) -> bool:
     return True
 
-  def _validate_input_spec(self, input_spec: types.ShapeDType) -> None:
+  def _validate_input_spec(self, input_spec: types.ChannelSpec) -> None:
     del input_spec
     # TODO(rryan): Re-enable when SoundStream works as expected with this
     # (including the test).
@@ -1130,10 +1130,10 @@ def _validate_input_spec(self, input_spec: types.ShapeDType) -> None:
   @override
   def get_output_spec(
       self,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       constants: types.Constants | None = None,
-  ) -> types.ShapeDType:
+  ) -> types.ChannelSpec:
     self._validate_input_spec(input_spec)
     if self.config.sequence_input:
       # pyrefly: ignore[bad-assignment]
@@ -3219,7 +3219,7 @@ def layer(
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants: types.Constants | None = None,
diff --git a/sequence_layers/mlx/pooling.py b/sequence_layers/mlx/pooling.py
index c732243..3774957 100644
--- a/sequence_layers/mlx/pooling.py
+++ b/sequence_layers/mlx/pooling.py
@@ -261,7 +261,7 @@ def get_output_shape(self, input_shape, *, constants=None):
   def get_initial_state(
       self,
       batch_size: int,
-      input_spec: types.ShapeDType,
+      input_spec: types.ChannelSpec,
       *,
       training: bool,
       constants=None,

From 521ee9dd58c339d8bd89261e38624ea36e828af0 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Tue, 2 Jun 2026 00:00:21 +0000
Subject: [PATCH 20/29] fix(jax): restore explicit_semicausal padding and fix
 FFT/IRFFT dtypes

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/jax/dsp.py      | 38 +++++++++++++++++++++++++++++----
 sequence_layers/jax/dsp_test.py | 18 +++++++++++++++-
 2 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/sequence_layers/jax/dsp.py b/sequence_layers/jax/dsp.py
index 19f6375..0fd683b 100644
--- a/sequence_layers/jax/dsp.py
+++ b/sequence_layers/jax/dsp.py
@@ -709,7 +709,7 @@ def layer(
     return fft_fn(x, axis=axis)
 
 
-class FFT(types.PreservesType, FFTBase, spec.FFT):
+class FFT(FFTBase, spec.FFT):
   """A layer that applies an FFT to the channels dimension."""
 
   @dataclasses.dataclass(frozen=True)
@@ -737,6 +737,21 @@ def _fft_length(self) -> int | None:
   def _padding(self) -> str:
     return self.config.padding
 
+  @nn.nowrap
+  def get_output_dtype(
+      self,
+      input_dtype: types.DType,
+      *,
+      constants: types.Constants | None = None,
+  ) -> types.DType:
+    match input_dtype:
+      case jnp.bfloat16 | jnp.float16 | jnp.float32 | jnp.complex64:
+        return jnp.complex64
+      case jnp.float64 | jnp.complex128:
+        return jnp.complex128
+      case _:
+        raise ValueError(f'Unsupported input dtype: {input_dtype}')
+
   def _get_output_length(self, input_size: int) -> int:
     return self.config.fft_length or input_size
 
@@ -751,7 +766,7 @@ def fft_fn(x, axis):
     return fft_fn
 
 
-class IFFT(types.PreservesType, FFTBase, spec.IFFT):
+class IFFT(FFTBase, spec.IFFT):
   """A layer that applies an IFFT to the channels dimension."""
 
   @dataclasses.dataclass(frozen=True)
@@ -780,6 +795,21 @@ def _fft_length(self) -> int | None:
   def _padding(self) -> str:
     return self.config.padding
 
+  @nn.nowrap
+  def get_output_dtype(
+      self,
+      input_dtype: types.DType,
+      *,
+      constants: types.Constants | None = None,
+  ) -> types.DType:
+    match input_dtype:
+      case jnp.bfloat16 | jnp.float16 | jnp.float32 | jnp.complex64:
+        return jnp.complex64
+      case jnp.float64 | jnp.complex128:
+        return jnp.complex128
+      case _:
+        raise ValueError(f'Unsupported input dtype: {input_dtype}')
+
   def _get_output_length(self, input_size: int) -> int:
     return self.config.frame_length or input_size
 
@@ -901,9 +931,9 @@ def get_output_dtype(
       constants: types.Constants | None = None,
   ) -> types.DType:
     match input_dtype:
-      case jnp.complex64:
+      case jnp.complex64 | jnp.bfloat16 | jnp.float16 | jnp.float32:
         return jnp.float32
-      case jnp.complex128:
+      case jnp.complex128 | jnp.float64:
         return jnp.float64
       case _:
         raise ValueError(f'Unsupported input dtype: {input_dtype}')
diff --git a/sequence_layers/jax/dsp_test.py b/sequence_layers/jax/dsp_test.py
index 26fc43c..9b00dab 100644
--- a/sequence_layers/jax/dsp_test.py
+++ b/sequence_layers/jax/dsp_test.py
@@ -124,6 +124,7 @@ class FrameTest(test_utils.SequenceLayerTest, spec.FrameTest):
           'same',
           'valid',
           'semicausal_full',
+          'explicit_semicausal',
       ),
   )
   def test_frame_exhaustive(
@@ -132,11 +133,17 @@ def test_frame_exhaustive(
     key = jax.random.PRNGKey(1234)
     batch_size = 2
     frame_length, frame_step = frame_length_frame_step
+    if padding == 'explicit_semicausal':
+      total_pad = frame_length - 1
+      overlap = max(0, frame_length - frame_step)
+      explicit_padding = (overlap, total_pad - overlap)
+    else:
+      explicit_padding = padding
     x = test_utils.random_sequence(batch_size, 1, *channel_shape)
     l = dsp.Frame.Config(
         frame_length=frame_length,
         frame_step=frame_step,
-        padding=padding,
+        padding=explicit_padding,
         name='frame',
     ).make()
     l = self.init_and_bind_layer(key, l, x)
@@ -149,6 +156,7 @@ def test_frame_exhaustive(
             'reverse_causal_valid',
             'causal',
             'reverse_causal',
+            'explicit_semicausal',
         ),
     )
     self.assertEqual(l.block_size, frame_step)
@@ -160,6 +168,14 @@ def test_frame_exhaustive(
         expected_input_latency = frame_length - 1
       case 'semicausal_full':
         expected_input_latency = frame_step - 1
+      case 'explicit_semicausal':
+        # If frame_length >= frame_step, the below expression simplifies to
+        # frame_step - 1. If frame_length < frame_step, the expression
+        # simplifies to frame_length - 1. In both cases, the output latency will
+        # be zero both expressions are less than frame_step.
+        expected_input_latency = (frame_length - 1) - max(
+            0, frame_length - frame_step
+        )
       case _:
         # Unsupported defaults to zero.
         expected_input_latency = 0

From 391b1b3658cf6d094ad92dd058478ea17a8c7b4e Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Tue, 2 Jun 2026 00:00:28 +0000
Subject: [PATCH 21/29] feat(mlx): canonicalize dtype conversions and preserve
 int64/float64

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/mlx/init_mapping.py | 54 ++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/sequence_layers/mlx/init_mapping.py b/sequence_layers/mlx/init_mapping.py
index d9cc6c1..72f0e81 100644
--- a/sequence_layers/mlx/init_mapping.py
+++ b/sequence_layers/mlx/init_mapping.py
@@ -71,30 +71,44 @@ def init_fn(key, shape, dtype=mx.float32):
   return init_fn
 
 
+# Canonical numpy dtype name -> MLX dtype. Keyed by the exact name returned by
+# np.dtype(...).name, so no ordering or substring concerns apply.
+# Every dtype maps to its exact MLX equivalent -- we never downcast on the
+# user's behalf, since that would silently change precision and hide bugs.
+# (Some, e.g. float64, exist but are unsupported on the Metal GPU; let MLX
+# raise at the op so the mismatch is visible rather than masked.)
+_MX_DTYPE_BY_NAME = {
+    'bfloat16': mx.bfloat16,
+    'float32': mx.float32,
+    'float16': mx.float16,
+    'float64': mx.float64,
+    'int32': mx.int32,
+    'int64': mx.int64,
+    'int16': mx.int16,
+    'int8': mx.int8,
+    'uint8': mx.uint8,
+    'uint16': mx.uint16,
+    'uint32': mx.uint32,
+    'uint64': mx.uint64,
+    'bool': mx.bool_,
+    'complex64': mx.complex64,
+}
+
+
 def _to_mx_dtype(dtype):
   """Convert any dtype (JAX, numpy, MLX) to an MLX dtype."""
   if isinstance(dtype, mx.Dtype):
     return dtype
-  name = getattr(dtype, '__name__', '') or str(dtype)
-  mapping = {
-      'float32': mx.float32,
-      'float16': mx.float16,
-      'bfloat16': mx.bfloat16,
-      'float64': mx.float32,  # MLX lacks float64.
-      'int32': mx.int32,
-      'int64': mx.int32,  # MLX lacks int64.
-      'int16': mx.int16,
-      'int8': mx.int8,
-      'uint8': mx.uint8,
-      'uint32': mx.uint32,
-      'bool': mx.bool_,
-      'bool_': mx.bool_,
-      'complex64': mx.complex64,
-  }
-  for key, val in mapping.items():
-    if key in name:
-      return val
-  return mx.float32
+  if dtype is None:
+    # np.dtype(None) silently yields float64; require callers to resolve their
+    # own default rather than coerce here.
+    raise ValueError('_to_mx_dtype received None; expected a concrete dtype.')
+  # numpy understands JAX, ml_dtypes (bfloat16), string, and Python-type
+  # inputs, canonicalizing each to an exact name we can look up directly.
+  name = np.dtype(dtype).name
+  if name not in _MX_DTYPE_BY_NAME:
+    raise ValueError(f'No MLX dtype mapping for {dtype!r} (numpy name {name!r}).')
+  return _MX_DTYPE_BY_NAME[name]
 
 
 def _zeros_init(key, shape, dtype=mx.float32):

From eccd8452f826eb935cfa05994b51a161dfb74571 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Tue, 2 Jun 2026 00:00:43 +0000
Subject: [PATCH 22/29] chore: migrate all relative imports to absolute imports
 for Google3

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/mlx/combinators.py   | 2 +-
 sequence_layers/mlx/convolution.py   | 2 +-
 sequence_layers/mlx/convolution2d.py | 2 +-
 sequence_layers/mlx/dsp.py           | 2 +-
 sequence_layers/mlx/export.py        | 2 +-
 sequence_layers/mlx/position.py      | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/sequence_layers/mlx/combinators.py b/sequence_layers/mlx/combinators.py
index a3dc1ad..c6db286 100644
--- a/sequence_layers/mlx/combinators.py
+++ b/sequence_layers/mlx/combinators.py
@@ -14,7 +14,7 @@
 from sequence_layers.mlx import utils as mlx_utils
 from sequence_layers.specs import combinators as spec
 
-from . import types as bt
+from sequence_layers.mlx import types as bt
 
 Sequence = bt.Sequence
 CombinationMode = spec.CombinationMode
diff --git a/sequence_layers/mlx/convolution.py b/sequence_layers/mlx/convolution.py
index b7c96dd..b77af11 100644
--- a/sequence_layers/mlx/convolution.py
+++ b/sequence_layers/mlx/convolution.py
@@ -15,7 +15,7 @@
     SequenceLayerConfig as _SequenceLayerConfig
 from sequence_layers.specs import convolution as spec
 
-from . import types as bt
+from sequence_layers.mlx import types as bt
 
 Sequence = bt.Sequence
 MaskedSequence = bt.MaskedSequence
diff --git a/sequence_layers/mlx/convolution2d.py b/sequence_layers/mlx/convolution2d.py
index 21f2130..b35b9bf 100644
--- a/sequence_layers/mlx/convolution2d.py
+++ b/sequence_layers/mlx/convolution2d.py
@@ -17,7 +17,7 @@
     SequenceLayerConfig as _SequenceLayerConfig
 from sequence_layers.specs import convolution as spec
 
-from . import types as bt
+from sequence_layers.mlx import types as bt
 
 Sequence = bt.Sequence
 MaskedSequence = bt.MaskedSequence
diff --git a/sequence_layers/mlx/dsp.py b/sequence_layers/mlx/dsp.py
index 1a2e102..0109426 100644
--- a/sequence_layers/mlx/dsp.py
+++ b/sequence_layers/mlx/dsp.py
@@ -13,7 +13,7 @@
 from sequence_layers.mlx import types
 from sequence_layers.specs import dsp as spec
 
-from . import types as bt
+from sequence_layers.mlx import types as bt
 
 Sequence = bt.Sequence
 MaskedSequence = bt.MaskedSequence
diff --git a/sequence_layers/mlx/export.py b/sequence_layers/mlx/export.py
index 1beebce..338a317 100644
--- a/sequence_layers/mlx/export.py
+++ b/sequence_layers/mlx/export.py
@@ -2,7 +2,7 @@
 
 import mlx.core as mx
 
-from . import types as bt
+from sequence_layers.mlx import types as bt
 
 Sequence = bt.Sequence
 
diff --git a/sequence_layers/mlx/position.py b/sequence_layers/mlx/position.py
index a41280b..c185952 100644
--- a/sequence_layers/mlx/position.py
+++ b/sequence_layers/mlx/position.py
@@ -24,7 +24,7 @@
 from sequence_layers.mlx.init_mapping import _to_mx_dtype
 from sequence_layers.specs import position as position_spec
 
-from . import types as bt
+from sequence_layers.mlx import types as bt
 
 Sequence = bt.Sequence
 MaskedSequence = bt.MaskedSequence

From 8dc45f4a73f32f9f94f6c7fd742d44286cef6782 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Tue, 2 Jun 2026 00:04:38 +0000
Subject: [PATCH 23/29] refactor(mlx): unify duplicate _to_mx_dtype
 implementations

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/mlx/simple.py | 20 ++------------------
 sequence_layers/mlx/utils.py  | 25 ++-----------------------
 2 files changed, 4 insertions(+), 41 deletions(-)

diff --git a/sequence_layers/mlx/simple.py b/sequence_layers/mlx/simple.py
index dfa9cd4..938a4e1 100644
--- a/sequence_layers/mlx/simple.py
+++ b/sequence_layers/mlx/simple.py
@@ -10,6 +10,7 @@
 import mlx.core as mx
 import numpy as np
 
+from sequence_layers.mlx import init_mapping
 from sequence_layers.mlx import types
 from sequence_layers.specs import simple as spec
 
@@ -22,24 +23,7 @@ def _to_mx_dtype(dtype: Any) -> mx.Dtype | None:
   """Converts various dtype representations to MLX DType."""
   if dtype is None:
     return None
-  if isinstance(dtype, str):
-    if dtype == 'float32':
-      return mx.float32
-    if dtype == 'float16':
-      return mx.float16
-    if dtype == 'int32':
-      return mx.int32
-    if dtype == 'bool':
-      return mx.bool_
-  if dtype == np.float32:
-    return mx.float32
-  if dtype == np.float16:
-    return mx.float16
-  if dtype == np.int32:
-    return mx.int32
-  if dtype in (np.bool_, bool):
-    return mx.bool_
-  return dtype
+  return init_mapping._to_mx_dtype(dtype)
 
 
 # ---------------------------------------------------------------------------
diff --git a/sequence_layers/mlx/utils.py b/sequence_layers/mlx/utils.py
index a2739c2..9f648b1 100644
--- a/sequence_layers/mlx/utils.py
+++ b/sequence_layers/mlx/utils.py
@@ -8,6 +8,7 @@
 import mlx.core as mx
 import numpy as np
 
+from sequence_layers.mlx import init_mapping
 from sequence_layers.specs import combinators as spec_combinators
 from sequence_layers.specs import types as specs_types
 
@@ -84,29 +85,7 @@ def _to_mx_dtype(dtype: Any) -> Any:
   """Converts various dtype representations to MLX DType."""
   if dtype is None:
     return None
-  if isinstance(dtype, str):
-    if dtype == 'float32':
-      return mx.float32
-    if dtype == 'float16':
-      return mx.float16
-    if dtype == 'int32':
-      return mx.int32
-    if dtype == 'bool':
-      return mx.bool_
-  # Handle JAX/Numpy dtypes
-  try:
-    np_dtype = np.dtype(dtype)
-    if np_dtype == np.float32:
-      return mx.float32
-    if np_dtype == np.float16:
-      return mx.float16
-    if np_dtype == np.int32:
-      return mx.int32
-    if np_dtype == np.bool_:
-      return mx.bool_
-  except (TypeError, ValueError):
-    pass
-  return dtype
+  return init_mapping._to_mx_dtype(dtype)
 
 
 def _map_activation(act: Any) -> Any:

From f32f073dbbe24c3901ffcb74f44a6ea59ae8d285 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Tue, 2 Jun 2026 00:09:55 +0000
Subject: [PATCH 24/29] style(jax): reorder explicit_semicausal in dsp_test.py
 to minimize diff with main

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/jax/dsp_test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sequence_layers/jax/dsp_test.py b/sequence_layers/jax/dsp_test.py
index 9b00dab..3aa56d4 100644
--- a/sequence_layers/jax/dsp_test.py
+++ b/sequence_layers/jax/dsp_test.py
@@ -123,8 +123,8 @@ class FrameTest(test_utils.SequenceLayerTest, spec.FrameTest):
           'reverse_causal',
           'same',
           'valid',
-          'semicausal_full',
           'explicit_semicausal',
+          'semicausal_full',
       ),
   )
   def test_frame_exhaustive(
@@ -166,8 +166,6 @@ def test_frame_exhaustive(
         expected_input_latency = 0
       case 'reverse_causal_valid' | 'reverse_causal':
         expected_input_latency = frame_length - 1
-      case 'semicausal_full':
-        expected_input_latency = frame_step - 1
       case 'explicit_semicausal':
         # If frame_length >= frame_step, the below expression simplifies to
         # frame_step - 1. If frame_length < frame_step, the expression
@@ -176,6 +174,8 @@ def test_frame_exhaustive(
         expected_input_latency = (frame_length - 1) - max(
             0, frame_length - frame_step
         )
+      case 'semicausal_full':
+        expected_input_latency = frame_step - 1
       case _:
         # Unsupported defaults to zero.
         expected_input_latency = 0

From 1c1107e9533b5609279e0fdf61d9ea60506d8ae9 Mon Sep 17 00:00:00 2001
From: David Braun <2096055+DBraun@users.noreply.github.com>
Date: Tue, 2 Jun 2026 14:49:00 -0500
Subject: [PATCH 25/29] feat(mlx): add EinsumDense.to_quantized for int8/int4
 quantization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mlx.nn.quantize only quantizes modules that define a to_quantized() method.
The MLX EinsumDense layer — used for the attention head-combining output
projection (equation '...nh,dnh->...d') — had no to_quantized(), so under
nn.quantize it silently stayed full-precision (bf16) while the rest of the
model was int8-quantized. That makes exported models larger and slower and
changes numerics versus an all-int8 model.

Add to_quantized() for the '...nh,dnh->...d' equation: flatten the [d, n, h]
kernel to [d, n*h], mx.quantize it, and rebind layer() to flatten the
[..., n, h] input and use mx.quantized_matmul. Other equations are returned
unchanged.
---
 sequence_layers/mlx/dense.py | 54 ++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/sequence_layers/mlx/dense.py b/sequence_layers/mlx/dense.py
index fef6a34..2aaed38 100644
--- a/sequence_layers/mlx/dense.py
+++ b/sequence_layers/mlx/dense.py
@@ -262,6 +262,60 @@ def einsum_fn(v):
       return x.apply_values(einsum_fn)
     return x.apply_values_masked(einsum_fn)
 
+  def to_quantized(self, group_size: int = 64, bits: int = 4, mode: str = 'affine'):
+    """Weight-only quantize the head-combining projection kernel.
+
+    Only the '...nh,dnh->...d' equation (the attention output projection) is
+    supported; other equations are returned unchanged. The [d, n, h] kernel is
+    flattened to [d, n*h] and quantized; the layer is rebound to flatten the
+    [..., n, h] input to [..., n*h] and use mx.quantized_matmul.
+    """
+    if (
+        self.kernel is None
+        or self.config.equation != '...nh,dnh->...d'
+        or (self.kernel.shape[-1] * self.kernel.shape[-2]) % group_size != 0
+    ):
+      return self
+
+    _d, _n, _h = self.kernel.shape
+    kernel_2d = self.kernel.reshape(_d, _n * _h)
+    self.q_weight, self.q_scales, self.q_biases = mx.quantize(
+        kernel_2d, group_size=group_size, bits=bits
+    )
+    self._q_group_size = group_size
+    self._q_bits = bits
+    self.kernel = None
+
+    activation = self.activation
+
+    def _quantized_layer(self, x, *, training: bool, constants=None):
+      compute_dtype = self.get_output_dtype(x.dtype)
+
+      def quantized_einsum_fn(v):
+        v_2d = v.reshape(*v.shape[:-2], _n * _h).astype(compute_dtype)
+        y = mx.quantized_matmul(
+            v_2d,
+            self.q_weight,
+            scales=self.q_scales,
+            biases=self.q_biases,
+            transpose=True,
+            group_size=self._q_group_size,
+            bits=self._q_bits,
+        )
+        if self.bias is not None:
+          y = y + self.bias
+        if activation is not None:
+          y = activation(y)
+        return y
+
+      if self.bias is not None or activation is not None:
+        return x.apply_values(quantized_einsum_fn)
+      return x.apply_values_masked(quantized_einsum_fn)
+
+    import types as _pytypes
+    self.layer = _pytypes.MethodType(_quantized_layer, self)
+    return self
+
 
 def _parse_equation(equation):
   """Parse einsum equation of form '...ab,bc->...ac'."""

From af9efa04355906fd05d641b5d86b8881b0e929e9 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Tue, 2 Jun 2026 21:59:09 +0000
Subject: [PATCH 26/29] feat(mlx): support combined QKV projection quantization
 in DotProductSelfAttention

Quantizes attention layers when using combined projections (CombinedQueryKeyValueProjection layout, which is the default in mrt2 samplers). Splits the combined bias into q_bias and kv_bias to remain fully compatible with downstream evaluation functions.

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/mlx/attention.py | 34 ++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/sequence_layers/mlx/attention.py b/sequence_layers/mlx/attention.py
index bbf1464..20b07be 100644
--- a/sequence_layers/mlx/attention.py
+++ b/sequence_layers/mlx/attention.py
@@ -894,26 +894,44 @@ def to_quantized(
   ):
     """Convert attention projection layers to quantized versions."""
     del mode  # Unused in MLX quantize
-    if (
-        getattr(self, 'q_proj', None) is None
-        or self.q_proj.shape[0] % group_size != 0
-    ):
+
+    # Determine in_features from whichever projection layout was initialized.
+    in_features = None
+    if getattr(self, 'qkv_proj', None) is not None:
+      in_features = self.qkv_proj.shape[0]
+    elif getattr(self, 'q_proj', None) is not None:
+      in_features = self.q_proj.shape[0]
+
+    if in_features is None or in_features % group_size != 0:
       return self
 
     self._quant_group_size = group_size
     self._quant_bits = bits
 
-    w_q = self.q_proj.T
-    # kv_proj is already combined [in, 2*kv_dim].
-    w_kv = self.kv_proj.T
-    w_qkv = mx.concatenate([w_q, w_kv], axis=0)
+    # Build the combined QKV weight matrix from whichever layout exists.
+    if getattr(self, 'qkv_proj', None) is not None:
+      w_qkv = self.qkv_proj.T
+    else:
+      w_q = self.q_proj.T
+      # kv_proj is already combined [in, 2*kv_dim].
+      w_kv = self.kv_proj.T
+      w_qkv = mx.concatenate([w_q, w_kv], axis=0)
+
     self.qkv_proj_qw, self.qkv_proj_qs, self.qkv_proj_qb = mx.quantize(
         w_qkv, group_size=group_size, bits=bits
     )
 
+    # Clear all original projection weights.
+    self.qkv_proj = cast(Any, None)
     self.q_proj = cast(Any, None)
     self.kv_proj = cast(Any, None)
 
+    # Split combined bias into q_bias / kv_bias for the quantized path.
+    if self.use_bias and getattr(self, 'qkv_bias', None) is not None:
+      d_q = self.num_heads * self.units_per_head
+      self.q_bias, self.kv_bias = mx.split(self.qkv_bias, [d_q], axis=-1)
+      self.qkv_bias = cast(Any, None)
+
     def _project_qkv(self, x):
       b, t = x.shape[0], x.shape[1]
       dtype = self.compute_dtype or x.dtype

From fc1874aed424fafa4cdcc1576bc960ae9986c939 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Thu, 4 Jun 2026 09:33:53 +0000
Subject: [PATCH 27/29] restore Python 3.12 compatibility

---
 pyproject.toml                           |  2 +-
 sequence_layers/specs/dense.py           |  8 ++++----
 sequence_layers/specs/types.py           | 14 +++++++-------
 sequence_layers/specs/types_behaviors.py |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e40e2bf..f154b9f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@
 name = "sequence_layers"
 description = "Sequence Layers neural network layer library from Google."
 readme = "README.md"
-requires-python = ">=3.13"
+requires-python = ">=3.12"
 license = {file = "LICENSE"}
 authors = [
     {name = "RJ Skerry-Ryan", email="rjryan@google.com"},
diff --git a/sequence_layers/specs/dense.py b/sequence_layers/specs/dense.py
index e3a591f..fa39702 100644
--- a/sequence_layers/specs/dense.py
+++ b/sequence_layers/specs/dense.py
@@ -11,8 +11,8 @@
 
 
 class Dense[
-    SequenceT: types_spec.Sequence = types_spec.Sequence,
-    ShapeDTypeT: types_spec.ChannelSpec = types_spec.ChannelSpec,
+    SequenceT: types_spec.Sequence,
+    ShapeDTypeT: types_spec.ChannelSpec,
 ](
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
@@ -36,8 +36,8 @@ def make(self) -> Any:
 
 
 class EinsumDense[
-    SequenceT: types_spec.Sequence = types_spec.Sequence,
-    ShapeDTypeT: types_spec.ChannelSpec = types_spec.ChannelSpec,
+    SequenceT: types_spec.Sequence,
+    ShapeDTypeT: types_spec.ChannelSpec,
 ](
     types_spec.Stateless[SequenceT, SequenceT, ShapeDTypeT],
     metaclass=abc.ABCMeta,
diff --git a/sequence_layers/specs/types.py b/sequence_layers/specs/types.py
index 395815c..ffa78f4 100644
--- a/sequence_layers/specs/types.py
+++ b/sequence_layers/specs/types.py
@@ -88,14 +88,14 @@ def __init__(self, shape: Shape, dtype: Any):
 Emits = jt.PyTree[Array]
 
 
-ValuesT = TypeVar('ValuesT', bound=Array, default=Array)
-MaskT = TypeVar('MaskT', bound=Array, default=Array)
-ChannelSpecT = TypeVar('ChannelSpecT', bound=ChannelSpec, default=ChannelSpec)
+ValuesT = TypeVar('ValuesT', bound=Array)
+MaskT = TypeVar('MaskT', bound=Array)
+ChannelSpecT = TypeVar('ChannelSpecT', bound=ChannelSpec)
 
-LengthsT = TypeVar('LengthsT', bound=Array, default=Array)
+LengthsT = TypeVar('LengthsT', bound=Array)
 
-InputT = TypeVar('InputT', bound='Sequence', default='Sequence')
-OutputT = TypeVar('OutputT', bound='Sequence', default='Sequence')
+InputT = TypeVar('InputT', bound='Sequence')
+OutputT = TypeVar('OutputT', bound='Sequence')
 
 # A "self" type alias to allow Sequence and subclasses to return their own
 # Sequence subtype. (Self cannot be parameterized.)
@@ -218,7 +218,7 @@ class PaddingMode(enum.Enum):
 ]
 
 
-class Sequence[ValuesT = Array, MaskT = Array](metaclass=abc.ABCMeta):
+class Sequence[ValuesT, MaskT](metaclass=abc.ABCMeta):
   """A generic sequence container that preserves masking information.
 
   Note: This class can hold non-backend-specific arrays (like `np.ndarray`) to
diff --git a/sequence_layers/specs/types_behaviors.py b/sequence_layers/specs/types_behaviors.py
index 3f93c87..40a62bc 100644
--- a/sequence_layers/specs/types_behaviors.py
+++ b/sequence_layers/specs/types_behaviors.py
@@ -780,7 +780,7 @@ class StatelessPointwiseFunctorTest(SequenceLayerTest):
 
   def create_layer(
       self, is_mask_required: bool
-  ) -> types_spec.SequenceLayer[Any]:
+  ) -> types_spec.SequenceLayer[Any, Any, Any]:
     """Creates a stateless pointwise functor layer."""
 
     backend_sl = self.sl

From 8d37df0b4697930409e16fbeceefa4aa0e941fd1 Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Thu, 4 Jun 2026 09:55:33 +0000
Subject: [PATCH 28/29] simplify versioning to 0.3a1

---
 sequence_layers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sequence_layers/__init__.py b/sequence_layers/__init__.py
index c9e86ac..ed9219c 100644
--- a/sequence_layers/__init__.py
+++ b/sequence_layers/__init__.py
@@ -13,4 +13,4 @@
 # limitations under the License.
 """Package directory file for Sequence Layers."""
 
-__version__ = '0.3.0rc1'
+__version__ = '0.3a1'

From 3d25391c47fb5af7b6ac62cf9a6902caf75c66fc Mon Sep 17 00:00:00 2001
From: Julian Salazar <julsal@google.com>
Date: Thu, 4 Jun 2026 09:56:17 +0000
Subject: [PATCH 29/29] refactor(mlx): use mlx_layers backing attribute for
 Serial combinators

Refactors SerialCombinatorMixin and its subclasses (Serial, SerialModules) to use a public `mlx_layers` backing attribute instead of dynamic `setattr` loops or private `_layers` lists.

This is required because MLX nn.Module only tracks submodules that are stored in public attributes (without a leading underscore) for parameter collection. Since `layers` is a read-only property in the shared spec, we use `mlx_layers` as the backing attribute and have the mixin's `layers` property return it.

Also updates the JAX-to-MLX weight converter to use index-based access (`layers[i]`) instead of dynamic attribute lookup.

TAG=agy
CONV=21ada17b-3411-4090-8450-e69d8ebfeae6
---
 sequence_layers/converters/jax_to_mlx.py |  4 +--
 sequence_layers/mlx/combinators.py       | 33 ++++++++++++++----------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/sequence_layers/converters/jax_to_mlx.py b/sequence_layers/converters/jax_to_mlx.py
index 41376e3..5090369 100644
--- a/sequence_layers/converters/jax_to_mlx.py
+++ b/sequence_layers/converters/jax_to_mlx.py
@@ -243,7 +243,7 @@ def _load_serial(mlx_serial, linen_params, config, batch_stats=None):
     child_bs = batch_stats.get(key, {}) if batch_stats else None
 
     _load_config(
-        getattr(mlx_serial, name),
+        mlx_serial.layers[i],
         child_params,
         layer_config,
         batch_stats=child_bs,
@@ -326,7 +326,7 @@ def _load_residual(mlx_residual, linen_params, config, batch_stats=None):
     child_bs = batch_stats.get(key, {}) if batch_stats else None
 
     _load_config(
-        getattr(body, name),
+        body.layers[i],
         child_params,
         layer_config,
         batch_stats=child_bs,
diff --git a/sequence_layers/mlx/combinators.py b/sequence_layers/mlx/combinators.py
index c6db286..9cf2e40 100644
--- a/sequence_layers/mlx/combinators.py
+++ b/sequence_layers/mlx/combinators.py
@@ -98,8 +98,20 @@ class SerialCombinatorMixin:
 
   @property
   def layers(self) -> list[types.SequenceLayer]:
-    """Returns the list of layers in the serial combinator."""
-    raise NotImplementedError()
+    """Returns the list of layers in the serial combinator.
+
+    MLX nn.Module requires submodules to be stored in public attributes (without
+    a leading underscore) to be tracked for parameter collection. However,
+    because 'layers' is defined as a read-only property in the spec, we cannot
+    assign to 'self.layers' directly in __init__.
+
+    To satisfy both constraints, subclasses must store their child layers in the
+    public attribute 'self.mlx_layers' (which MLX will track), and this property
+    will return it.
+    """
+    if not hasattr(self, 'mlx_layers'):
+      raise AttributeError("self.mlx_layers backing attribute not initialized")
+    return self.mlx_layers
 
   @property
   def supports_step(self):
@@ -213,12 +225,8 @@ class SerialModules(
 
   def __init__(self, layers: _Sequence[types.SequenceLayer]):
     super().__init__()
-    self._layers = list(layers)
-
-  @property
-  @override
-  def layers(self) -> list[types.SequenceLayer]:
-    return self._layers
+    # Store in mlx_layers to enable MLX parameter tracking
+    self.mlx_layers = list(layers)
 
 
 class Serial(
@@ -257,13 +265,10 @@ def __init__(
         if isinstance(name_opt, str):
           name = name_opt
       self._layer_names.append(name)
-      setattr(self, name, l)
-      setattr(self, f'layers_{i}', l)
+    # Store in mlx_layers to enable MLX parameter tracking
+    self.mlx_layers = layers
+
 
-  @property
-  @override
-  def layers(self) -> list[types.SequenceLayer]:
-    return [getattr(self, name) for name in self._layer_names]
 
   @classmethod
   def from_config(cls, config, backend='mlx'):