From a92c70c08116511e0977c0e71595219d32834f02 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 17 Jun 2026 05:36:56 +0000
Subject: [PATCH 1/7] port final set of model tests and others

---
 tests/models/test_modeling_common.py          | 2172 +----------------
 .../test_models_dit_transformer2d.py          |  109 +-
 .../test_models_pixart_transformer2d.py       |  116 +-
 .../models/transformers/test_models_prior.py  |  105 +-
 .../test_models_transformer_allegro.py        |   87 +-
 .../test_models_transformer_aura_flow.py      |   89 +-
 .../test_models_transformer_cogvideox.py      |  161 +-
 .../test_models_transformer_cogview3plus.py   |  108 +-
 .../test_models_transformer_cogview4.py       |   95 +-
 .../test_models_transformer_consisid.py       |   88 +-
 .../test_models_transformer_latte.py          |   97 +-
 .../test_models_transformer_motif_video.py    |    2 +-
 .../test_models_transformer_sana_video.py     |  103 +-
 .../test_models_transformer_temporal.py       |   76 +-
 tests/others/test_utils.py                    |    2 +-
 15 files changed, 780 insertions(+), 2630 deletions(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 8575439649d7..7e7822ac16ea 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -13,225 +13,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
-import gc
-import glob
 import inspect
-import json
+import logging
 import os
-import re
 import tempfile
-import traceback
-import unittest
 import unittest.mock as mock
 import uuid
-from collections import defaultdict
-from typing import Dict, List, Tuple
 
-import numpy as np
 import pytest
 import requests_mock
-import safetensors.torch
 import torch
-import torch.nn as nn
-from accelerate.utils.modeling import _get_proper_dtype, compute_module_sizes, dtype_byte_size
 from huggingface_hub import ModelCard, delete_repo, snapshot_download, try_to_load_from_cache
 from huggingface_hub.utils import HfHubHTTPError, is_jinja_available
-from parameterized import parameterized
 
 from diffusers.models import FluxTransformer2DModel, SD3Transformer2DModel, UNet2DConditionModel
-from diffusers.models.attention_processor import (
-    AttnProcessor,
-    AttnProcessor2_0,
-    AttnProcessorNPU,
-    XFormersAttnProcessor,
-)
-from diffusers.models.auto_model import AutoModel
-from diffusers.models.modeling_outputs import BaseOutput
-from diffusers.training_utils import EMAModel
-from diffusers.utils import (
-    SAFE_WEIGHTS_INDEX_NAME,
-    WEIGHTS_INDEX_NAME,
-    is_peft_available,
-    is_torch_npu_available,
-    is_xformers_available,
-    logging,
-)
-from diffusers.utils.hub_utils import _add_variant
-from diffusers.utils.torch_utils import get_torch_cuda_device_capability
 
 from ..others.test_utils import TOKEN, USER, is_staging_test
 from ..testing_utils import (
     CaptureLogger,
-    _check_safetensors_serialization,
-    backend_empty_cache,
-    backend_max_memory_allocated,
-    backend_reset_peak_memory_stats,
-    backend_synchronize,
-    check_if_dicts_are_equal,
-    get_python_version,
-    is_torch_compile,
-    numpy_cosine_similarity_distance,
-    require_peft_backend,
-    require_peft_version_greater,
-    require_torch_2,
     require_torch_accelerator,
-    require_torch_accelerator_with_training,
-    require_torch_multi_accelerator,
-    require_torch_version_greater,
-    run_test_in_subprocess,
-    slow,
-    torch_all_close,
     torch_device,
 )
 
 
-if is_peft_available():
-    from peft.tuners.tuners_utils import BaseTunerLayer
-
-
-def caculate_expected_num_shards(index_map_path):
-    with open(index_map_path) as f:
-        weight_map_dict = json.load(f)["weight_map"]
-    first_key = list(weight_map_dict.keys())[0]
-    weight_loc = weight_map_dict[first_key]  # e.g., diffusion_pytorch_model-00001-of-00002.safetensors
-    expected_num_shards = int(weight_loc.split("-")[-1].split(".")[0])
-    return expected_num_shards
-
-
-def check_if_lora_correctly_set(model) -> bool:
-    """
-    Checks if the LoRA layers are correctly set with peft
-    """
-    for module in model.modules():
-        if isinstance(module, BaseTunerLayer):
-            return True
-    return False
-
-
-def normalize_output(out):
-    out0 = out[0] if isinstance(out, (BaseOutput, tuple)) else out
-    return torch.stack(out0) if isinstance(out0, list) else out0
-
-
-# Will be run via run_test_in_subprocess
-def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout):
-    error = None
-    try:
-        init_dict, model_class = in_queue.get(timeout=timeout)
-
-        model = model_class(**init_dict)
-        model.to(torch_device)
-        model = torch.compile(model)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname, safe_serialization=False)
-            new_model = model_class.from_pretrained(tmpdirname)
-            new_model.to(torch_device)
-
-        assert new_model.__class__ == model_class
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
-def named_persistent_module_tensors(
-    module: nn.Module,
-    recurse: bool = False,
-):
-    """
-    A helper function that gathers all the tensors (parameters + persistent buffers) of a given module.
-
-    Args:
-        module (`torch.nn.Module`):
-            The module we want the tensors on.
-        recurse (`bool`, *optional`, defaults to `False`):
-            Whether or not to go look in every submodule or just return the direct parameters and buffers.
-    """
-    yield from module.named_parameters(recurse=recurse)
-
-    for named_buffer in module.named_buffers(recurse=recurse):
-        name, _ = named_buffer
-        # Get parent by splitting on dots and traversing the model
-        parent = module
-        if "." in name:
-            parent_name = name.rsplit(".", 1)[0]
-            for part in parent_name.split("."):
-                parent = getattr(parent, part)
-            name = name.split(".")[-1]
-        if name not in parent._non_persistent_buffers_set:
-            yield named_buffer
-
-
-def compute_module_persistent_sizes(
-    model: nn.Module,
-    dtype: str | torch.device | None = None,
-    special_dtypes: dict[str, str | torch.device] | None = None,
-):
-    """
-    Compute the size of each submodule of a given model (parameters + persistent buffers).
-    """
-    if dtype is not None:
-        dtype = _get_proper_dtype(dtype)
-        dtype_size = dtype_byte_size(dtype)
-    if special_dtypes is not None:
-        special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()}
-        special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()}
-    module_sizes = defaultdict(int)
-
-    module_list = []
-
-    module_list = named_persistent_module_tensors(model, recurse=True)
-
-    for name, tensor in module_list:
-        if special_dtypes is not None and name in special_dtypes:
-            size = tensor.numel() * special_dtypes_size[name]
-        elif dtype is None:
-            size = tensor.numel() * dtype_byte_size(tensor.dtype)
-        elif str(tensor.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
-            # According to the code in set_module_tensor_to_device, these types won't be converted
-            # so use their original size here
-            size = tensor.numel() * dtype_byte_size(tensor.dtype)
-        else:
-            size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype))
-        name_parts = name.split(".")
-        for idx in range(len(name_parts) + 1):
-            module_sizes[".".join(name_parts[:idx])] += size
-
-    return module_sizes
-
-
-def cast_maybe_tensor_dtype(maybe_tensor, current_dtype, target_dtype):
-    if torch.is_tensor(maybe_tensor):
-        return maybe_tensor.to(target_dtype) if maybe_tensor.dtype == current_dtype else maybe_tensor
-    if isinstance(maybe_tensor, dict):
-        return {k: cast_maybe_tensor_dtype(v, current_dtype, target_dtype) for k, v in maybe_tensor.items()}
-    if isinstance(maybe_tensor, list):
-        return [cast_maybe_tensor_dtype(v, current_dtype, target_dtype) for v in maybe_tensor]
-    return maybe_tensor
-
-
-class ModelUtilsTest(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-
+class TestModelUtils:
     def test_missing_key_loading_warning_message(self):
-        with self.assertLogs("diffusers.models.modeling_utils", level="WARNING") as logs:
+        logger = logging.getLogger("diffusers.models.modeling_utils")
+        with CaptureLogger(logger) as cap_logger:
             UNet2DConditionModel.from_pretrained("hf-internal-testing/stable-diffusion-broken", subfolder="unet")
 
         # make sure that error message states what keys are missing
-        assert "conv_out.bias" in " ".join(logs.output)
+        assert "conv_out.bias" in cap_logger.out
 
-    @parameterized.expand(
+    @pytest.mark.parametrize(
+        "repo_id, subfolder, use_local",
         [
             ("hf-internal-testing/tiny-stable-diffusion-pipe-variants-all-kinds", "unet", False),
             ("hf-internal-testing/tiny-stable-diffusion-pipe-variants-all-kinds", "unet", True),
             ("hf-internal-testing/tiny-sd-unet-with-sharded-ckpt", None, False),
             ("hf-internal-testing/tiny-sd-unet-with-sharded-ckpt", None, True),
-        ]
+        ],
     )
     def test_variant_sharded_ckpt_legacy_format_raises_warning(self, repo_id, subfolder, use_local):
         def load_model(path):
@@ -240,7 +61,7 @@ def load_model(path):
                 kwargs["subfolder"] = subfolder
             return UNet2DConditionModel.from_pretrained(path, **kwargs)
 
-        with self.assertWarns(FutureWarning) as warning:
+        with pytest.warns(FutureWarning) as warning:
             if use_local:
                 with tempfile.TemporaryDirectory() as tmpdirname:
                     tmpdirname = snapshot_download(repo_id=repo_id)
@@ -248,19 +69,20 @@ def load_model(path):
             else:
                 _ = load_model(repo_id)
 
-        warning_messages = " ".join(str(w.message) for w in warning.warnings)
-        self.assertIn("This serialization format is now deprecated to standardize the serialization", warning_messages)
+        warning_messages = " ".join(str(w.message) for w in warning)
+        assert "This serialization format is now deprecated to standardize the serialization" in warning_messages
 
     # Local tests are already covered down below.
-    @parameterized.expand(
+    @pytest.mark.parametrize(
+        "repo_id, subfolder, variant",
         [
             ("hf-internal-testing/tiny-sd-unet-sharded-latest-format", None, "fp16"),
             ("hf-internal-testing/tiny-sd-unet-sharded-latest-format-subfolder", "unet", "fp16"),
             ("hf-internal-testing/tiny-sd-unet-sharded-no-variants", None, None),
             ("hf-internal-testing/tiny-sd-unet-sharded-no-variants-subfolder", "unet", None),
-        ]
+        ],
     )
-    def test_variant_sharded_ckpt_loads_from_hub(self, repo_id, subfolder, variant=None):
+    def test_variant_sharded_ckpt_loads_from_hub(self, repo_id, subfolder, variant):
         def load_model():
             kwargs = {}
             if variant:
@@ -312,7 +134,7 @@ def test_local_files_only_with_sharded_checkpoint(self):
             with mock.patch("huggingface_hub.hf_api.get_session", return_value=client_mock):
                 # Should fail with local_files_only=False (network required)
                 # We would make a network call with model_info
-                with self.assertRaises(OSError):
+                with pytest.raises(OSError):
                     FluxTransformer2DModel.from_pretrained(
                         repo_id, subfolder="transformer", cache_dir=tmpdir, local_files_only=False
                     )
@@ -334,19 +156,19 @@ def test_local_files_only_with_sharded_checkpoint(self):
             os.remove(cached_shard_file)
 
             # Attempting to load from cache should raise an error
-            with self.assertRaises(OSError) as context:
+            with pytest.raises(OSError) as context:
                 FluxTransformer2DModel.from_pretrained(
                     repo_id, subfolder="transformer", cache_dir=tmpdir, local_files_only=True
                 )
 
             # Verify error mentions the missing shard
-            error_msg = str(context.exception)
+            error_msg = str(context.value)
             assert cached_shard_file in error_msg or "required according to the checkpoint index" in error_msg, (
                 f"Expected error about missing shard, got: {error_msg}"
             )
 
-    @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
-    @unittest.skipIf(torch_device == "mps", reason="Test not supported for MPS.")
+    @pytest.mark.skip(reason="Flaky behaviour on CI. Re-enable after migrating to new runners")
+    @pytest.mark.skipif(torch_device == "mps", reason="Test not supported for MPS.")
     def test_one_request_upon_cached(self):
         use_safetensors = False
 
@@ -379,7 +201,7 @@ def test_one_request_upon_cached(self):
             )
 
     def test_weight_overwrite(self):
-        with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(ValueError) as error_context:
+        with tempfile.TemporaryDirectory() as tmpdirname, pytest.raises(ValueError) as error_context:
             UNet2DConditionModel.from_pretrained(
                 "hf-internal-testing/tiny-stable-diffusion-torch",
                 subfolder="unet",
@@ -388,7 +210,7 @@ def test_weight_overwrite(self):
             )
 
         # make sure that error message states what keys are missing
-        assert "Cannot load" in str(error_context.exception)
+        assert "Cannot load" in str(error_context.value)
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             model = UNet2DConditionModel.from_pretrained(
@@ -420,9 +242,9 @@ def test_keep_modules_in_fp32(self):
             for name, module in model.named_modules():
                 if isinstance(module, torch.nn.Linear):
                     if name in model._keep_in_fp32_modules:
-                        self.assertTrue(module.weight.dtype == torch.float32)
+                        assert module.weight.dtype == torch.float32
                     else:
-                        self.assertTrue(module.weight.dtype == torch_dtype)
+                        assert module.weight.dtype == torch_dtype
 
         def get_dummy_inputs():
             batch_size = 2
@@ -486,1542 +308,8 @@ def test_forward_with_norm_groups(self):
         assert output.shape == expected_shape, "Input and output shapes do not match"
 
 
-class ModelTesterMixin:
-    main_input_name = None  # overwrite in model specific tester class
-    base_precision = 1e-3
-    forward_requires_fresh_args = False
-    model_split_percents = [0.5, 0.7, 0.9]
-    uses_custom_attn_processor = False
-
-    def check_device_map_is_respected(self, model, device_map):
-        for param_name, param in model.named_parameters():
-            # Find device in device_map
-            while len(param_name) > 0 and param_name not in device_map:
-                param_name = ".".join(param_name.split(".")[:-1])
-            if param_name not in device_map:
-                raise ValueError("device map is incomplete, it does not contain any device for `param_name`.")
-
-            param_device = device_map[param_name]
-            if param_device in ["cpu", "disk"]:
-                self.assertEqual(param.device, torch.device("meta"))
-            else:
-                self.assertEqual(param.device, torch.device(param_device))
-
-    def test_from_save_pretrained(self, expected_max_diff=5e-5):
-        if self.forward_requires_fresh_args:
-            model = self.model_class(**self.init_dict)
-        else:
-            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict)
-
-        if hasattr(model, "set_default_attn_processor"):
-            model.set_default_attn_processor()
-        model.to(torch_device)
-        model.eval()
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname, safe_serialization=False)
-            new_model = self.model_class.from_pretrained(tmpdirname)
-            if hasattr(new_model, "set_default_attn_processor"):
-                new_model.set_default_attn_processor()
-            new_model.to(torch_device)
-
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                image = model(**self.inputs_dict(0))
-            else:
-                image = model(**inputs_dict)
-
-            if isinstance(image, dict):
-                image = image.to_tuple()[0]
-
-            if self.forward_requires_fresh_args:
-                new_image = new_model(**self.inputs_dict(0))
-            else:
-                new_image = new_model(**inputs_dict)
-
-            if isinstance(new_image, dict):
-                new_image = new_image.to_tuple()[0]
-
-            image = normalize_output(image)
-            new_image = normalize_output(new_image)
-
-        max_diff = (image - new_image).abs().max().item()
-        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
-
-    def test_getattr_is_correct(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict)
-
-        # save some things to test
-        model.dummy_attribute = 5
-        model.register_to_config(test_attribute=5)
-
-        logger = logging.get_logger("diffusers.models.modeling_utils")
-        # 30 for warning
-        logger.setLevel(30)
-        with CaptureLogger(logger) as cap_logger:
-            assert hasattr(model, "dummy_attribute")
-            assert getattr(model, "dummy_attribute") == 5
-            assert model.dummy_attribute == 5
-
-        # no warning should be thrown
-        assert cap_logger.out == ""
-
-        logger = logging.get_logger("diffusers.models.modeling_utils")
-        # 30 for warning
-        logger.setLevel(30)
-        with CaptureLogger(logger) as cap_logger:
-            assert hasattr(model, "save_pretrained")
-            fn = model.save_pretrained
-            fn_1 = getattr(model, "save_pretrained")
-
-            assert fn == fn_1
-        # no warning should be thrown
-        assert cap_logger.out == ""
-
-        # warning should be thrown
-        with self.assertWarns(FutureWarning):
-            assert model.test_attribute == 5
-
-        with self.assertWarns(FutureWarning):
-            assert getattr(model, "test_attribute") == 5
-
-        with self.assertRaises(AttributeError) as error:
-            model.does_not_exist
-
-        assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'"
-
-    @unittest.skipIf(
-        torch_device != "npu" or not is_torch_npu_available(),
-        reason="torch npu flash attention is only available with NPU and `torch_npu` installed",
-    )
-    def test_set_torch_npu_flash_attn_processor_determinism(self):
-        torch.use_deterministic_algorithms(False)
-        if self.forward_requires_fresh_args:
-            model = self.model_class(**self.init_dict)
-        else:
-            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict)
-        model.to(torch_device)
-
-        if not hasattr(model, "set_attn_processor"):
-            # If not has `set_attn_processor`, skip test
-            return
-
-        model.set_default_attn_processor()
-        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output = model(**self.inputs_dict(0))[0]
-            else:
-                output = model(**inputs_dict)[0]
-
-        model.enable_npu_flash_attention()
-        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output_2 = model(**self.inputs_dict(0))[0]
-            else:
-                output_2 = model(**inputs_dict)[0]
-
-        model.set_attn_processor(AttnProcessorNPU())
-        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output_3 = model(**self.inputs_dict(0))[0]
-            else:
-                output_3 = model(**inputs_dict)[0]
-
-        torch.use_deterministic_algorithms(True)
-
-        assert torch.allclose(output, output_2, atol=self.base_precision)
-        assert torch.allclose(output, output_3, atol=self.base_precision)
-        assert torch.allclose(output_2, output_3, atol=self.base_precision)
-
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_set_xformers_attn_processor_for_determinism(self):
-        torch.use_deterministic_algorithms(False)
-        if self.forward_requires_fresh_args:
-            model = self.model_class(**self.init_dict)
-        else:
-            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict)
-        model.to(torch_device)
-
-        if not hasattr(model, "set_attn_processor"):
-            # If not has `set_attn_processor`, skip test
-            return
-
-        if not hasattr(model, "set_default_attn_processor"):
-            # If not has `set_attn_processor`, skip test
-            return
-
-        model.set_default_attn_processor()
-        assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output = model(**self.inputs_dict(0))[0]
-            else:
-                output = model(**inputs_dict)[0]
-
-        model.enable_xformers_memory_efficient_attention()
-        assert all(type(proc) == XFormersAttnProcessor for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output_2 = model(**self.inputs_dict(0))[0]
-            else:
-                output_2 = model(**inputs_dict)[0]
-
-        model.set_attn_processor(XFormersAttnProcessor())
-        assert all(type(proc) == XFormersAttnProcessor for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output_3 = model(**self.inputs_dict(0))[0]
-            else:
-                output_3 = model(**inputs_dict)[0]
-
-        torch.use_deterministic_algorithms(True)
-
-        assert torch.allclose(output, output_2, atol=self.base_precision)
-        assert torch.allclose(output, output_3, atol=self.base_precision)
-        assert torch.allclose(output_2, output_3, atol=self.base_precision)
-
-    @require_torch_accelerator
-    def test_set_attn_processor_for_determinism(self):
-        if self.uses_custom_attn_processor:
-            return
-
-        torch.use_deterministic_algorithms(False)
-        if self.forward_requires_fresh_args:
-            model = self.model_class(**self.init_dict)
-        else:
-            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict)
-
-        model.to(torch_device)
-
-        if not hasattr(model, "set_attn_processor"):
-            # If not has `set_attn_processor`, skip test
-            return
-
-        assert all(type(proc) == AttnProcessor2_0 for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output_1 = model(**self.inputs_dict(0))[0]
-            else:
-                output_1 = model(**inputs_dict)[0]
-
-        model.set_default_attn_processor()
-        assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output_2 = model(**self.inputs_dict(0))[0]
-            else:
-                output_2 = model(**inputs_dict)[0]
-
-        model.set_attn_processor(AttnProcessor2_0())
-        assert all(type(proc) == AttnProcessor2_0 for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output_4 = model(**self.inputs_dict(0))[0]
-            else:
-                output_4 = model(**inputs_dict)[0]
-
-        model.set_attn_processor(AttnProcessor())
-        assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values())
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output_5 = model(**self.inputs_dict(0))[0]
-            else:
-                output_5 = model(**inputs_dict)[0]
-
-        torch.use_deterministic_algorithms(True)
-
-        # make sure that outputs match
-        assert torch.allclose(output_2, output_1, atol=self.base_precision)
-        assert torch.allclose(output_2, output_4, atol=self.base_precision)
-        assert torch.allclose(output_2, output_5, atol=self.base_precision)
-
-    def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
-        if self.forward_requires_fresh_args:
-            model = self.model_class(**self.init_dict)
-        else:
-            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict)
-
-        if hasattr(model, "set_default_attn_processor"):
-            model.set_default_attn_processor()
-
-        model.to(torch_device)
-        model.eval()
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname, variant="fp16", safe_serialization=False)
-            new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")
-            if hasattr(new_model, "set_default_attn_processor"):
-                new_model.set_default_attn_processor()
-
-            # non-variant cannot be loaded
-            with self.assertRaises(OSError) as error_context:
-                self.model_class.from_pretrained(tmpdirname)
-
-            # make sure that error message states what keys are missing
-            assert "Error no file named diffusion_pytorch_model.bin found in directory" in str(error_context.exception)
-
-            new_model.to(torch_device)
-
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                image = model(**self.inputs_dict(0))
-            else:
-                image = model(**inputs_dict)
-            if isinstance(image, dict):
-                image = image.to_tuple()[0]
-
-            if self.forward_requires_fresh_args:
-                new_image = new_model(**self.inputs_dict(0))
-            else:
-                new_image = new_model(**inputs_dict)
-
-            if isinstance(new_image, dict):
-                new_image = new_image.to_tuple()[0]
-
-            image = normalize_output(image)
-            new_image = normalize_output(new_image)
-
-        max_diff = (image - new_image).abs().max().item()
-        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
-
-    @is_torch_compile
-    @require_torch_2
-    @unittest.skipIf(
-        get_python_version == (3, 12),
-        reason="Torch Dynamo isn't yet supported for Python 3.12.",
-    )
-    def test_from_save_pretrained_dynamo(self):
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-        inputs = [init_dict, self.model_class]
-        run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=inputs)
-
-    def test_from_save_pretrained_dtype(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict)
-        model.to(torch_device)
-        model.eval()
-
-        for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-            if torch_device == "mps" and dtype == torch.bfloat16:
-                continue
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.to(dtype)
-                model.save_pretrained(tmpdirname, safe_serialization=False)
-                new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=True, torch_dtype=dtype)
-                assert new_model.dtype == dtype
-                if (
-                    hasattr(self.model_class, "_keep_in_fp32_modules")
-                    and self.model_class._keep_in_fp32_modules is None
-                ):
-                    new_model = self.model_class.from_pretrained(
-                        tmpdirname, low_cpu_mem_usage=False, torch_dtype=dtype
-                    )
-                    assert new_model.dtype == dtype
-
-    def test_determinism(self, expected_max_diff=1e-5):
-        if self.forward_requires_fresh_args:
-            model = self.model_class(**self.init_dict)
-        else:
-            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict)
-        model.to(torch_device)
-        model.eval()
-
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                first = model(**self.inputs_dict(0))
-            else:
-                first = model(**inputs_dict)
-            if isinstance(first, dict):
-                first = first.to_tuple()[0]
-
-            if self.forward_requires_fresh_args:
-                second = model(**self.inputs_dict(0))
-            else:
-                second = model(**inputs_dict)
-            if isinstance(second, dict):
-                second = second.to_tuple()[0]
-
-            first = normalize_output(first)
-            second = normalize_output(second)
-
-        out_1 = first.cpu().numpy()
-        out_2 = second.cpu().numpy()
-        out_1 = out_1[~np.isnan(out_1)]
-        out_2 = out_2[~np.isnan(out_2)]
-        max_diff = np.amax(np.abs(out_1 - out_2))
-        self.assertLessEqual(max_diff, expected_max_diff)
-
-    def test_output(self, expected_output_shape=None):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict)
-        model.to(torch_device)
-        model.eval()
-
-        with torch.no_grad():
-            output = model(**inputs_dict)
-
-            if isinstance(output, dict):
-                output = output.to_tuple()[0]
-            if isinstance(output, list):
-                output = torch.stack(output)
-
-        self.assertIsNotNone(output)
-
-        # input & output have to have the same shape
-        input_tensor = inputs_dict[self.main_input_name]
-        if isinstance(input_tensor, list):
-            input_tensor = torch.stack(input_tensor)
-
-        if expected_output_shape is None:
-            expected_shape = input_tensor.shape
-            self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
-        else:
-            self.assertEqual(output.shape, expected_output_shape, "Input and output shapes do not match")
-
-    def test_model_from_pretrained(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict)
-        model.to(torch_device)
-        model.eval()
-
-        # test if the model can be loaded from the config
-        # and has all the expected shape
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname, safe_serialization=False)
-            new_model = self.model_class.from_pretrained(tmpdirname)
-            new_model.to(torch_device)
-            new_model.eval()
-
-        # check if all parameters shape are the same
-        for param_name in model.state_dict().keys():
-            param_1 = model.state_dict()[param_name]
-            param_2 = new_model.state_dict()[param_name]
-            self.assertEqual(param_1.shape, param_2.shape)
-
-        with torch.no_grad():
-            output_1 = model(**inputs_dict)
-
-            if isinstance(output_1, dict):
-                output_1 = output_1.to_tuple()[0]
-            if isinstance(output_1, list):
-                output_1 = torch.stack(output_1)
-
-            output_2 = new_model(**inputs_dict)
-
-            if isinstance(output_2, dict):
-                output_2 = output_2.to_tuple()[0]
-            if isinstance(output_2, list):
-                output_2 = torch.stack(output_2)
-
-        self.assertEqual(output_1.shape, output_2.shape)
-
-    @require_torch_accelerator_with_training
-    def test_training(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict)
-        model.to(torch_device)
-        model.train()
-        output = model(**inputs_dict)
-
-        if isinstance(output, dict):
-            output = output.to_tuple()[0]
-
-        input_tensor = inputs_dict[self.main_input_name]
-        noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device)
-        loss = torch.nn.functional.mse_loss(output, noise)
-        loss.backward()
-
-    @require_torch_accelerator_with_training
-    def test_ema_training(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict)
-        model.to(torch_device)
-        model.train()
-        ema_model = EMAModel(model.parameters())
-
-        output = model(**inputs_dict)
-
-        if isinstance(output, dict):
-            output = output.to_tuple()[0]
-
-        input_tensor = inputs_dict[self.main_input_name]
-        noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device)
-        loss = torch.nn.functional.mse_loss(output, noise)
-        loss.backward()
-        ema_model.step(model.parameters())
-
-    def test_outputs_equivalence(self):
-        def set_nan_tensor_to_zero(t):
-            # Temporary fallback until `aten::_index_put_impl_` is implemented in mps
-            # Track progress in https://github.com/pytorch/pytorch/issues/77764
-            device = t.device
-            if device.type == "mps":
-                t = t.to("cpu")
-            t[t != t] = 0
-            return t.to(device)
-
-        def recursive_check(tuple_object, dict_object):
-            if isinstance(tuple_object, (List, Tuple)):
-                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
-                    recursive_check(tuple_iterable_value, dict_iterable_value)
-            elif isinstance(tuple_object, Dict):
-                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
-                    recursive_check(tuple_iterable_value, dict_iterable_value)
-            elif tuple_object is None:
-                return
-            else:
-                self.assertTrue(
-                    torch.allclose(
-                        set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
-                    ),
-                    msg=(
-                        "Tuple and dict output are not equal. Difference:"
-                        f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
-                        f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
-                        f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
-                    ),
-                )
-
-        if self.forward_requires_fresh_args:
-            model = self.model_class(**self.init_dict)
-        else:
-            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict)
-
-        model.to(torch_device)
-        model.eval()
-
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                outputs_dict = model(**self.inputs_dict(0))
-                outputs_tuple = model(**self.inputs_dict(0), return_dict=False)
-            else:
-                outputs_dict = model(**inputs_dict)
-                outputs_tuple = model(**inputs_dict, return_dict=False)
-
-        recursive_check(outputs_tuple, outputs_dict)
-
-    @require_torch_accelerator_with_training
-    def test_enable_disable_gradient_checkpointing(self):
-        # Skip test if model does not support gradient checkpointing
-        if not self.model_class._supports_gradient_checkpointing:
-            pytest.skip("Gradient checkpointing is not supported.")
-
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-
-        # at init model should have gradient checkpointing disabled
-        model = self.model_class(**init_dict)
-        self.assertFalse(model.is_gradient_checkpointing)
-
-        # check enable works
-        model.enable_gradient_checkpointing()
-        self.assertTrue(model.is_gradient_checkpointing)
-
-        # check disable works
-        model.disable_gradient_checkpointing()
-        self.assertFalse(model.is_gradient_checkpointing)
-
-    @require_torch_accelerator_with_training
-    def test_effective_gradient_checkpointing(self, loss_tolerance=1e-5, param_grad_tol=5e-5, skip: set[str] = {}):
-        # Skip test if model does not support gradient checkpointing
-        if not self.model_class._supports_gradient_checkpointing:
-            pytest.skip("Gradient checkpointing is not supported.")
-
-        # enable deterministic behavior for gradient checkpointing
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        inputs_dict_copy = copy.deepcopy(inputs_dict)
-        torch.manual_seed(0)
-        model = self.model_class(**init_dict)
-        model.to(torch_device)
-
-        assert not model.is_gradient_checkpointing and model.training
-
-        out = model(**inputs_dict).sample
-        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
-        # we won't calculate the loss and rather backprop on out.sum()
-        model.zero_grad()
-
-        labels = torch.randn_like(out)
-        loss = (out - labels).mean()
-        loss.backward()
-
-        # re-instantiate the model now enabling gradient checkpointing
-        torch.manual_seed(0)
-        model_2 = self.model_class(**init_dict)
-        # clone model
-        model_2.load_state_dict(model.state_dict())
-        model_2.to(torch_device)
-        model_2.enable_gradient_checkpointing()
-
-        assert model_2.is_gradient_checkpointing and model_2.training
-
-        out_2 = model_2(**inputs_dict_copy).sample
-        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
-        # we won't calculate the loss and rather backprop on out.sum()
-        model_2.zero_grad()
-        loss_2 = (out_2 - labels).mean()
-        loss_2.backward()
-
-        # compare the output and parameters gradients
-        self.assertTrue((loss - loss_2).abs() < loss_tolerance)
-        named_params = dict(model.named_parameters())
-        named_params_2 = dict(model_2.named_parameters())
-
-        for name, param in named_params.items():
-            if "post_quant_conv" in name:
-                continue
-            if name in skip:
-                continue
-            # TODO(aryan): remove the below lines after looking into easyanimate transformer a little more
-            # It currently errors out the gradient checkpointing test because the gradients for attn2.to_out is None
-            if param.grad is None:
-                continue
-            self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=param_grad_tol))
-
-    @unittest.skipIf(torch_device == "mps", "This test is not supported for MPS devices.")
-    def test_gradient_checkpointing_is_applied(
-        self, expected_set=None, attention_head_dim=None, num_attention_heads=None, block_out_channels=None
-    ):
-        # Skip test if model does not support gradient checkpointing
-        if not self.model_class._supports_gradient_checkpointing:
-            pytest.skip("Gradient checkpointing is not supported.")
-
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        if attention_head_dim is not None:
-            init_dict["attention_head_dim"] = attention_head_dim
-        if num_attention_heads is not None:
-            init_dict["num_attention_heads"] = num_attention_heads
-        if block_out_channels is not None:
-            init_dict["block_out_channels"] = block_out_channels
-
-        model_class_copy = copy.copy(self.model_class)
-        model = model_class_copy(**init_dict)
-        model.enable_gradient_checkpointing()
-
-        modules_with_gc_enabled = {}
-        for submodule in model.modules():
-            if hasattr(submodule, "gradient_checkpointing"):
-                self.assertTrue(submodule.gradient_checkpointing)
-                modules_with_gc_enabled[submodule.__class__.__name__] = True
-
-        assert set(modules_with_gc_enabled.keys()) == expected_set
-        assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
-
-    def test_deprecated_kwargs(self):
-        has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters
-        has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0
-
-        if has_kwarg_in_model_class and not has_deprecated_kwarg:
-            raise ValueError(
-                f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs"
-                " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are"
-                " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
-                " [<deprecated_argument>]`"
-            )
-
-        if not has_kwarg_in_model_class and has_deprecated_kwarg:
-            raise ValueError(
-                f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs"
-                " under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs` argument to"
-                f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument"
-                " from `_deprecated_kwargs = [<deprecated_argument>]`"
-            )
-
-    @parameterized.expand([(4, 4, True), (4, 8, False), (8, 4, False)])
-    @torch.no_grad()
-    @unittest.skipIf(not is_peft_available(), "Only with PEFT")
-    def test_save_load_lora_adapter(self, rank, lora_alpha, use_dora=False):
-        from peft import LoraConfig
-        from peft.utils import get_peft_model_state_dict
-
-        from diffusers.loaders.peft import PeftAdapterMixin
-
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-
-        if not issubclass(model.__class__, PeftAdapterMixin):
-            pytest.skip(f"PEFT is not supported for this model ({model.__class__.__name__}).")
-
-        torch.manual_seed(0)
-        output_no_lora = model(**inputs_dict, return_dict=False)[0]
-        if isinstance(output_no_lora, list):
-            output_no_lora = torch.stack(output_no_lora)
-
-        denoiser_lora_config = LoraConfig(
-            r=rank,
-            lora_alpha=lora_alpha,
-            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
-            init_lora_weights=False,
-            use_dora=use_dora,
-        )
-        model.add_adapter(denoiser_lora_config)
-        self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
-
-        torch.manual_seed(0)
-        outputs_with_lora = model(**inputs_dict, return_dict=False)[0]
-        if isinstance(outputs_with_lora, list):
-            outputs_with_lora = torch.stack(outputs_with_lora)
-
-        self.assertFalse(torch.allclose(output_no_lora, outputs_with_lora, atol=1e-4, rtol=1e-4))
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            model.save_lora_adapter(tmpdir)
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            state_dict_loaded = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-
-            model.unload_lora()
-            self.assertFalse(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
-
-            model.load_lora_adapter(tmpdir, prefix=None, use_safetensors=True)
-            state_dict_retrieved = get_peft_model_state_dict(model, adapter_name="default_0")
-
-            for k in state_dict_loaded:
-                loaded_v = state_dict_loaded[k]
-                retrieved_v = state_dict_retrieved[k].to(loaded_v.device)
-                self.assertTrue(torch.allclose(loaded_v, retrieved_v))
-
-            self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
-
-        torch.manual_seed(0)
-        outputs_with_lora_2 = model(**inputs_dict, return_dict=False)[0]
-        if isinstance(outputs_with_lora_2, list):
-            outputs_with_lora_2 = torch.stack(outputs_with_lora_2)
-
-        self.assertFalse(torch.allclose(output_no_lora, outputs_with_lora_2, atol=1e-4, rtol=1e-4))
-        self.assertTrue(torch.allclose(outputs_with_lora, outputs_with_lora_2, atol=1e-4, rtol=1e-4))
-
-    @unittest.skipIf(not is_peft_available(), "Only with PEFT")
-    def test_lora_wrong_adapter_name_raises_error(self):
-        from peft import LoraConfig
-
-        from diffusers.loaders.peft import PeftAdapterMixin
-
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-
-        if not issubclass(model.__class__, PeftAdapterMixin):
-            pytest.skip(f"PEFT is not supported for this model ({model.__class__.__name__}).")
-
-        denoiser_lora_config = LoraConfig(
-            r=4,
-            lora_alpha=4,
-            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
-            init_lora_weights=False,
-            use_dora=False,
-        )
-        model.add_adapter(denoiser_lora_config)
-        self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            wrong_name = "foo"
-            with self.assertRaises(ValueError) as err_context:
-                model.save_lora_adapter(tmpdir, adapter_name=wrong_name)
-
-            self.assertTrue(f"Adapter name {wrong_name} not found in the model." in str(err_context.exception))
-
-    @parameterized.expand([(4, 4, True), (4, 8, False), (8, 4, False)])
-    @torch.no_grad()
-    @unittest.skipIf(not is_peft_available(), "Only with PEFT")
-    def test_lora_adapter_metadata_is_loaded_correctly(self, rank, lora_alpha, use_dora):
-        from peft import LoraConfig
-
-        from diffusers.loaders.peft import PeftAdapterMixin
-
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-
-        if not issubclass(model.__class__, PeftAdapterMixin):
-            pytest.skip(f"PEFT is not supported for this model ({model.__class__.__name__}).")
-
-        denoiser_lora_config = LoraConfig(
-            r=rank,
-            lora_alpha=lora_alpha,
-            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
-            init_lora_weights=False,
-            use_dora=use_dora,
-        )
-        model.add_adapter(denoiser_lora_config)
-        metadata = model.peft_config["default"].to_dict()
-        self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            model.save_lora_adapter(tmpdir)
-            model_file = os.path.join(tmpdir, "pytorch_lora_weights.safetensors")
-            self.assertTrue(os.path.isfile(model_file))
-
-            model.unload_lora()
-            self.assertFalse(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
-
-            model.load_lora_adapter(tmpdir, prefix=None, use_safetensors=True)
-            parsed_metadata = model.peft_config["default_0"].to_dict()
-            check_if_dicts_are_equal(metadata, parsed_metadata)
-
-    @torch.no_grad()
-    @unittest.skipIf(not is_peft_available(), "Only with PEFT")
-    def test_lora_adapter_wrong_metadata_raises_error(self):
-        from peft import LoraConfig
-
-        from diffusers.loaders.lora_base import LORA_ADAPTER_METADATA_KEY
-        from diffusers.loaders.peft import PeftAdapterMixin
-
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-
-        if not issubclass(model.__class__, PeftAdapterMixin):
-            pytest.skip(f"PEFT is not supported for this model ({model.__class__.__name__}).")
-
-        denoiser_lora_config = LoraConfig(
-            r=4,
-            lora_alpha=4,
-            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
-            init_lora_weights=False,
-            use_dora=False,
-        )
-        model.add_adapter(denoiser_lora_config)
-        self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            model.save_lora_adapter(tmpdir)
-            model_file = os.path.join(tmpdir, "pytorch_lora_weights.safetensors")
-            self.assertTrue(os.path.isfile(model_file))
-
-            # Perturb the metadata in the state dict.
-            loaded_state_dict = safetensors.torch.load_file(model_file)
-            metadata = {"format": "pt"}
-            lora_adapter_metadata = denoiser_lora_config.to_dict()
-            lora_adapter_metadata.update({"foo": 1, "bar": 2})
-            for key, value in lora_adapter_metadata.items():
-                if isinstance(value, set):
-                    lora_adapter_metadata[key] = list(value)
-            metadata[LORA_ADAPTER_METADATA_KEY] = json.dumps(lora_adapter_metadata, indent=2, sort_keys=True)
-            safetensors.torch.save_file(loaded_state_dict, model_file, metadata=metadata)
-
-            model.unload_lora()
-            self.assertFalse(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
-
-            with self.assertRaises(TypeError) as err_context:
-                model.load_lora_adapter(tmpdir, prefix=None, use_safetensors=True)
-            self.assertTrue("`LoraConfig` class could not be instantiated" in str(err_context.exception))
-
-    @require_torch_accelerator
-    def test_cpu_offload(self):
-        if self.model_class._no_split_modules is None:
-            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
-
-        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**config).eval()
-        model = model.to(torch_device)
-
-        torch.manual_seed(0)
-        base_output = model(**inputs_dict)
-        base_normalized_output = normalize_output(base_output)
-
-        model_size = compute_module_sizes(model)[""]
-        max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.cpu().save_pretrained(tmp_dir)
-
-            for max_size in max_gpu_sizes:
-                max_memory = {0: max_size, "cpu": model_size * 2}
-                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
-
-                # Making sure part of the model will actually end up offloaded
-                self.assertSetEqual(set(new_model.hf_device_map.values()), {0, "cpu"})
-
-                self.check_device_map_is_respected(new_model, new_model.hf_device_map)
-
-                torch.manual_seed(0)
-                new_output = new_model(**inputs_dict)
-                new_normalized_output = normalize_output(new_output)
-
-                self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
-
-    @require_torch_accelerator
-    def test_disk_offload_without_safetensors(self):
-        if self.model_class._no_split_modules is None:
-            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
-        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**config).eval()
-
-        model = model.to(torch_device)
-
-        torch.manual_seed(0)
-        base_output = model(**inputs_dict)
-        base_normalized_output = normalize_output(base_output)
-
-        model_size = compute_module_sizes(model)[""]
-        max_size = int(self.model_split_percents[0] * model_size)
-        # Force disk offload by setting very small CPU memory
-        max_memory = {0: max_size, "cpu": int(0.1 * max_size)}
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.cpu().save_pretrained(tmp_dir, safe_serialization=False)
-            with self.assertRaises(ValueError):
-                # This errors out because it's missing an offload folder
-                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
-
-            new_model = self.model_class.from_pretrained(
-                tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
-            )
-
-            self.check_device_map_is_respected(new_model, new_model.hf_device_map)
-            torch.manual_seed(0)
-            new_output = new_model(**inputs_dict)
-            new_normalized_output = normalize_output(new_output)
-            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
-
-    @require_torch_accelerator
-    def test_disk_offload_with_safetensors(self):
-        if self.model_class._no_split_modules is None:
-            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
-        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**config).eval()
-
-        model = model.to(torch_device)
-
-        torch.manual_seed(0)
-        base_output = model(**inputs_dict)
-        base_normalized_output = normalize_output(base_output)
-
-        model_size = compute_module_sizes(model)[""]
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.cpu().save_pretrained(tmp_dir)
-
-            max_size = int(self.model_split_percents[0] * model_size)
-            max_memory = {0: max_size, "cpu": max_size}
-            new_model = self.model_class.from_pretrained(
-                tmp_dir, device_map="auto", offload_folder=tmp_dir, max_memory=max_memory
-            )
-
-            self.check_device_map_is_respected(new_model, new_model.hf_device_map)
-            torch.manual_seed(0)
-            new_output = new_model(**inputs_dict)
-            new_normalized_output = normalize_output(new_output)
-
-            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
-
-    @require_torch_multi_accelerator
-    def test_model_parallelism(self):
-        if self.model_class._no_split_modules is None:
-            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
-        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**config).eval()
-
-        model = model.to(torch_device)
-
-        torch.manual_seed(0)
-        base_output = model(**inputs_dict)
-
-        model_size = compute_module_sizes(model)[""]
-        # We test several splits of sizes to make sure it works.
-        max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.cpu().save_pretrained(tmp_dir)
-
-            for max_size in max_gpu_sizes:
-                max_memory = {0: max_size, 1: model_size * 2, "cpu": model_size * 2}
-                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
-                # Making sure part of the model will actually end up offloaded
-                self.assertSetEqual(set(new_model.hf_device_map.values()), {0, 1})
-
-                self.check_device_map_is_respected(new_model, new_model.hf_device_map)
-
-                torch.manual_seed(0)
-                new_output = new_model(**inputs_dict)
-
-                self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
-
-    @require_torch_accelerator
-    def test_sharded_checkpoints(self):
-        torch.manual_seed(0)
-        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**config).eval()
-        model = model.to(torch_device)
-
-        base_output = model(**inputs_dict)
-        base_normalized_output = normalize_output(base_output)
-
-        model_size = compute_module_persistent_sizes(model)[""]
-        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
-            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
-
-            # Now check if the right number of shards exists. First, let's get the number of shards.
-            # Since this number can be dependent on the model being tested, it's important that we calculate it
-            # instead of hardcoding it.
-            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))
-            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
-            self.assertTrue(actual_num_shards == expected_num_shards)
-
-            new_model = self.model_class.from_pretrained(tmp_dir).eval()
-            new_model = new_model.to(torch_device)
-
-            torch.manual_seed(0)
-            if "generator" in inputs_dict:
-                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            new_output = new_model(**inputs_dict)
-            new_normalized_output = normalize_output(new_output)
-
-            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
-
-    @require_torch_accelerator
-    def test_sharded_checkpoints_with_variant(self):
-        torch.manual_seed(0)
-        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**config).eval()
-        model = model.to(torch_device)
-
-        base_output = model(**inputs_dict)
-        base_normalized_output = normalize_output(base_output)
-
-        model_size = compute_module_persistent_sizes(model)[""]
-        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
-        variant = "fp16"
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            # It doesn't matter if the actual model is in fp16 or not. Just adding the variant and
-            # testing if loading works with the variant when the checkpoint is sharded should be
-            # enough.
-            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB", variant=variant)
-
-            index_filename = _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)
-            self.assertTrue(os.path.exists(os.path.join(tmp_dir, index_filename)))
-
-            # Now check if the right number of shards exists. First, let's get the number of shards.
-            # Since this number can be dependent on the model being tested, it's important that we calculate it
-            # instead of hardcoding it.
-            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, index_filename))
-            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
-            self.assertTrue(actual_num_shards == expected_num_shards)
-
-            new_model = self.model_class.from_pretrained(tmp_dir, variant=variant).eval()
-            new_model = new_model.to(torch_device)
-
-            torch.manual_seed(0)
-            if "generator" in inputs_dict:
-                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            new_output = new_model(**inputs_dict)
-            new_normalized_output = normalize_output(new_output)
-
-            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
-
-    @require_torch_accelerator
-    def test_sharded_checkpoints_with_parallel_loading(self):
-        torch.manual_seed(0)
-        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**config).eval()
-        model = model.to(torch_device)
-
-        base_output = model(**inputs_dict)
-        base_normalized_output = normalize_output(base_output)
-
-        model_size = compute_module_persistent_sizes(model)[""]
-        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
-            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
-
-            # Now check if the right number of shards exists. First, let's get the number of shards.
-            # Since this number can be dependent on the model being tested, it's important that we calculate it
-            # instead of hardcoding it.
-            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))
-            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
-            self.assertTrue(actual_num_shards == expected_num_shards)
-
-            # Load with parallel loading
-            os.environ["HF_ENABLE_PARALLEL_LOADING"] = "yes"
-            new_model = self.model_class.from_pretrained(tmp_dir).eval()
-            new_model = new_model.to(torch_device)
-
-            torch.manual_seed(0)
-            if "generator" in inputs_dict:
-                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            new_output = new_model(**inputs_dict)
-            new_normalized_output = normalize_output(new_output)
-
-            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
-            # set to no.
-            os.environ["HF_ENABLE_PARALLEL_LOADING"] = "no"
-
-    @require_torch_accelerator
-    def test_sharded_checkpoints_device_map(self):
-        if self.model_class._no_split_modules is None:
-            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
-        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**config).eval()
-        model = model.to(torch_device)
-
-        torch.manual_seed(0)
-        base_output = model(**inputs_dict)
-        base_normalized_output = normalize_output(base_output)
-
-        model_size = compute_module_persistent_sizes(model)[""]
-        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
-            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
-
-            # Now check if the right number of shards exists. First, let's get the number of shards.
-            # Since this number can be dependent on the model being tested, it's important that we calculate it
-            # instead of hardcoding it.
-            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))
-            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
-            self.assertTrue(actual_num_shards == expected_num_shards)
-
-            new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto")
-
-            torch.manual_seed(0)
-            if "generator" in inputs_dict:
-                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            new_output = new_model(**inputs_dict)
-            new_normalized_output = normalize_output(new_output)
-
-            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
-
-    # This test is okay without a GPU because we're not running any execution. We're just serializing
-    # and check if the resultant files are following an expected format.
-    def test_variant_sharded_ckpt_right_format(self):
-        for use_safe in [True, False]:
-            extension = ".safetensors" if use_safe else ".bin"
-            config, _ = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**config).eval()
-
-            model_size = compute_module_persistent_sizes(model)[""]
-            max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
-            variant = "fp16"
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                model.cpu().save_pretrained(
-                    tmp_dir, variant=variant, max_shard_size=f"{max_shard_size}KB", safe_serialization=use_safe
-                )
-                index_variant = _add_variant(SAFE_WEIGHTS_INDEX_NAME if use_safe else WEIGHTS_INDEX_NAME, variant)
-                self.assertTrue(os.path.exists(os.path.join(tmp_dir, index_variant)))
-
-                # Now check if the right number of shards exists. First, let's get the number of shards.
-                # Since this number can be dependent on the model being tested, it's important that we calculate it
-                # instead of hardcoding it.
-                expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, index_variant))
-                actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(extension)])
-                self.assertTrue(actual_num_shards == expected_num_shards)
-
-                # Check if the variant is present as a substring in the checkpoints.
-                shard_files = [
-                    file
-                    for file in os.listdir(tmp_dir)
-                    if file.endswith(extension) or ("index" in file and "json" in file)
-                ]
-                assert all(variant in f for f in shard_files)
-
-                # Check if the sharded checkpoints were serialized in the right format.
-                shard_files = [file for file in os.listdir(tmp_dir) if file.endswith(extension)]
-                # Example: diffusion_pytorch_model.fp16-00001-of-00002.safetensors
-                assert all(f.split(".")[1].split("-")[0] == variant for f in shard_files)
-
-    def test_layerwise_casting_training(self):
-        def test_fn(storage_dtype, compute_dtype):
-            if torch.device(torch_device).type == "cpu" and compute_dtype == torch.bfloat16:
-                pytest.skip("Skipping test because CPU doesn't go well with bfloat16.")
-            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-            model = self.model_class(**init_dict)
-            model = model.to(torch_device, dtype=compute_dtype)
-            model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
-            model.train()
-
-            inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
-            with torch.amp.autocast(device_type=torch.device(torch_device).type):
-                output = model(**inputs_dict)
-
-                if isinstance(output, dict):
-                    output = output.to_tuple()[0]
-
-                input_tensor = inputs_dict[self.main_input_name]
-                noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device)
-                noise = cast_maybe_tensor_dtype(noise, torch.float32, compute_dtype)
-                loss = torch.nn.functional.mse_loss(output, noise)
-
-            loss.backward()
-
-        test_fn(torch.float16, torch.float32)
-        test_fn(torch.float8_e4m3fn, torch.float32)
-        test_fn(torch.float8_e5m2, torch.float32)
-        test_fn(torch.float8_e4m3fn, torch.bfloat16)
-
-    @torch.no_grad()
-    def test_layerwise_casting_inference(self):
-        from diffusers.hooks._common import _GO_LC_SUPPORTED_PYTORCH_LAYERS
-        from diffusers.hooks.layerwise_casting import DEFAULT_SKIP_MODULES_PATTERN
-
-        torch.manual_seed(0)
-        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**config)
-        model.eval()
-        model.to(torch_device)
-        base_slice = model(**inputs_dict)[0]
-        base_slice = normalize_output(base_slice)
-        base_slice = base_slice.detach().flatten().cpu().numpy()
-
-        def check_linear_dtype(module, storage_dtype, compute_dtype):
-            patterns_to_check = DEFAULT_SKIP_MODULES_PATTERN
-            if getattr(module, "_skip_layerwise_casting_patterns", None) is not None:
-                patterns_to_check += tuple(module._skip_layerwise_casting_patterns)
-            for name, submodule in module.named_modules():
-                if not isinstance(submodule, _GO_LC_SUPPORTED_PYTORCH_LAYERS):
-                    continue
-                dtype_to_check = storage_dtype
-                if any(re.search(pattern, name) for pattern in patterns_to_check):
-                    dtype_to_check = compute_dtype
-                if getattr(submodule, "weight", None) is not None:
-                    self.assertEqual(submodule.weight.dtype, dtype_to_check)
-                if getattr(submodule, "bias", None) is not None:
-                    self.assertEqual(submodule.bias.dtype, dtype_to_check)
-
-        def test_layerwise_casting(storage_dtype, compute_dtype):
-            torch.manual_seed(0)
-            config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
-            model = self.model_class(**config).eval()
-            model = model.to(torch_device, dtype=compute_dtype)
-            model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
-
-            check_linear_dtype(model, storage_dtype, compute_dtype)
-            output = model(**inputs_dict)[0]
-            output = normalize_output(output)
-            output = output.float().flatten().detach().cpu().numpy()
-
-            # The precision test is not very important for fast tests. In most cases, the outputs will not be the same.
-            # We just want to make sure that the layerwise casting is working as expected.
-            self.assertTrue(numpy_cosine_similarity_distance(base_slice, output) < 1.0)
-
-        test_layerwise_casting(torch.float16, torch.float32)
-        test_layerwise_casting(torch.float8_e4m3fn, torch.float32)
-        test_layerwise_casting(torch.float8_e5m2, torch.float32)
-        test_layerwise_casting(torch.float8_e4m3fn, torch.bfloat16)
-
-    @require_torch_accelerator
-    @torch.no_grad()
-    def test_layerwise_casting_memory(self):
-        MB_TOLERANCE = 0.2
-        LEAST_COMPUTE_CAPABILITY = 8.0
-
-        def reset_memory_stats():
-            gc.collect()
-            backend_synchronize(torch_device)
-            backend_empty_cache(torch_device)
-            backend_reset_peak_memory_stats(torch_device)
-
-        def get_memory_usage(storage_dtype, compute_dtype):
-            torch.manual_seed(0)
-            config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
-            model = self.model_class(**config).eval()
-            model = model.to(torch_device, dtype=compute_dtype)
-            model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
-
-            reset_memory_stats()
-            model(**inputs_dict)
-            model_memory_footprint = model.get_memory_footprint()
-            peak_inference_memory_allocated_mb = backend_max_memory_allocated(torch_device) / 1024**2
-
-            return model_memory_footprint, peak_inference_memory_allocated_mb
-
-        fp32_memory_footprint, fp32_max_memory = get_memory_usage(torch.float32, torch.float32)
-        fp8_e4m3_fp32_memory_footprint, fp8_e4m3_fp32_max_memory = get_memory_usage(torch.float8_e4m3fn, torch.float32)
-        fp8_e4m3_bf16_memory_footprint, fp8_e4m3_bf16_max_memory = get_memory_usage(
-            torch.float8_e4m3fn, torch.bfloat16
-        )
-
-        compute_capability = get_torch_cuda_device_capability() if torch_device == "cuda" else None
-        self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
-        # NOTE: the following assertion would fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
-        # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes. So, we conditionally check it.
-        if compute_capability and compute_capability >= LEAST_COMPUTE_CAPABILITY:
-            self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
-        # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
-        # bytes. This only happens for some models, so we allow a small tolerance.
-        # For any real model being tested, the order would be fp8_e4m3_bf16 < fp8_e4m3_fp32 < fp32.
-        self.assertTrue(
-            fp8_e4m3_fp32_max_memory < fp32_max_memory
-            or abs(fp8_e4m3_fp32_max_memory - fp32_max_memory) < MB_TOLERANCE
-        )
-
-    @parameterized.expand([False, True])
-    @require_torch_accelerator
-    def test_group_offloading(self, record_stream):
-        for cls in inspect.getmro(self.__class__):
-            if "test_group_offloading" in cls.__dict__ and cls is not ModelTesterMixin:
-                # Skip this test if it is overwritten by child class. We need to do this because parameterized
-                # materializes the test methods on invocation which cannot be overridden.
-                pytest.skip("Model does not support group offloading.")
-
-        if not self.model_class._supports_group_offloading:
-            pytest.skip("Model does not support group offloading.")
-
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        torch.manual_seed(0)
-
-        @torch.no_grad()
-        def run_forward(model):
-            self.assertTrue(
-                all(
-                    module._diffusers_hook.get_hook("group_offloading") is not None
-                    for module in model.modules()
-                    if hasattr(module, "_diffusers_hook")
-                )
-            )
-            model.eval()
-            return model(**inputs_dict)[0]
-
-        model = self.model_class(**init_dict)
-        model.to(torch_device)
-        output_without_group_offloading = run_forward(model)
-        output_without_group_offloading = normalize_output(output_without_group_offloading)
-
-        torch.manual_seed(0)
-        model = self.model_class(**init_dict)
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1)
-        output_with_group_offloading1 = run_forward(model)
-        output_with_group_offloading1 = normalize_output(output_with_group_offloading1)
-
-        torch.manual_seed(0)
-        model = self.model_class(**init_dict)
-        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, non_blocking=True)
-        output_with_group_offloading2 = run_forward(model)
-        output_with_group_offloading2 = normalize_output(output_with_group_offloading2)
-
-        torch.manual_seed(0)
-        model = self.model_class(**init_dict)
-        model.enable_group_offload(torch_device, offload_type="leaf_level")
-        output_with_group_offloading3 = run_forward(model)
-        output_with_group_offloading3 = normalize_output(output_with_group_offloading3)
-
-        torch.manual_seed(0)
-        model = self.model_class(**init_dict)
-        model.enable_group_offload(
-            torch_device, offload_type="leaf_level", use_stream=True, record_stream=record_stream
-        )
-        output_with_group_offloading4 = run_forward(model)
-        output_with_group_offloading4 = normalize_output(output_with_group_offloading4)
-
-        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading1, atol=1e-5))
-        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading2, atol=1e-5))
-        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading3, atol=1e-5))
-        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading4, atol=1e-5))
-
-    @parameterized.expand([(False, "block_level"), (True, "leaf_level")])
-    @require_torch_accelerator
-    @torch.no_grad()
-    def test_group_offloading_with_layerwise_casting(self, record_stream, offload_type):
-        if not self.model_class._supports_group_offloading:
-            pytest.skip("Model does not support group offloading.")
-
-        torch.manual_seed(0)
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict)
-
-        model.to(torch_device)
-        model.eval()
-        _ = model(**inputs_dict)[0]
-
-        torch.manual_seed(0)
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        storage_dtype, compute_dtype = torch.float16, torch.float32
-        inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
-        model = self.model_class(**init_dict)
-        model.eval()
-        additional_kwargs = {} if offload_type == "leaf_level" else {"num_blocks_per_group": 1}
-        model.enable_group_offload(
-            torch_device, offload_type=offload_type, use_stream=True, record_stream=record_stream, **additional_kwargs
-        )
-        model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
-        _ = model(**inputs_dict)[0]
-
-    @parameterized.expand([("block_level", False), ("leaf_level", True)])
-    @require_torch_accelerator
-    @torch.no_grad()
-    @torch.inference_mode()
-    def test_group_offloading_with_disk(self, offload_type, record_stream, atol=1e-5):
-        for cls in inspect.getmro(self.__class__):
-            if "test_group_offloading_with_disk" in cls.__dict__ and cls is not ModelTesterMixin:
-                # Skip this test if it is overwritten by child class. We need to do this because parameterized
-                # materializes the test methods on invocation which cannot be overridden.
-                pytest.skip("Model does not support group offloading with disk yet.")
-
-        if not self.model_class._supports_group_offloading:
-            pytest.skip("Model does not support group offloading.")
-
-        def _has_generator_arg(model):
-            sig = inspect.signature(model.forward)
-            params = sig.parameters
-            return "generator" in params
-
-        def _run_forward(model, inputs_dict):
-            accepts_generator = _has_generator_arg(model)
-            if accepts_generator:
-                inputs_dict["generator"] = torch.manual_seed(0)
-            torch.manual_seed(0)
-            return model(**inputs_dict)[0]
-
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        torch.manual_seed(0)
-        model = self.model_class(**init_dict)
-
-        model.eval()
-        model.to(torch_device)
-        output_without_group_offloading = _run_forward(model, inputs_dict)
-        output_without_group_offloading = normalize_output(output_without_group_offloading)
-
-        torch.manual_seed(0)
-        model = self.model_class(**init_dict)
-        model.eval()
-
-        num_blocks_per_group = None if offload_type == "leaf_level" else 1
-        additional_kwargs = {} if offload_type == "leaf_level" else {"num_blocks_per_group": num_blocks_per_group}
-        with tempfile.TemporaryDirectory() as tmpdir:
-            model.enable_group_offload(
-                torch_device,
-                offload_type=offload_type,
-                offload_to_disk_path=tmpdir,
-                use_stream=True,
-                record_stream=record_stream,
-                **additional_kwargs,
-            )
-            has_safetensors = glob.glob(f"{tmpdir}/*.safetensors")
-            self.assertTrue(has_safetensors, "No safetensors found in the directory.")
-
-            # For "leaf-level", there is a prefetching hook which makes this check a bit non-deterministic
-            # in nature. So, skip it.
-            if offload_type != "leaf_level":
-                is_correct, extra_files, missing_files = _check_safetensors_serialization(
-                    module=model,
-                    offload_to_disk_path=tmpdir,
-                    offload_type=offload_type,
-                    num_blocks_per_group=num_blocks_per_group,
-                    block_modules=model._group_offload_block_modules
-                    if hasattr(model, "_group_offload_block_modules")
-                    else None,
-                )
-                if not is_correct:
-                    if extra_files:
-                        raise ValueError(f"Found extra files: {', '.join(extra_files)}")
-                    elif missing_files:
-                        raise ValueError(f"Following files are missing: {', '.join(missing_files)}")
-
-            output_with_group_offloading = _run_forward(model, inputs_dict)
-            output_with_group_offloading = normalize_output(output_with_group_offloading)
-            self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading, atol=atol))
-
-    def test_auto_model(self, expected_max_diff=5e-5):
-        if self.forward_requires_fresh_args:
-            model = self.model_class(**self.init_dict)
-        else:
-            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict)
-
-        model = model.eval()
-        model = model.to(torch_device)
-
-        if hasattr(model, "set_default_attn_processor"):
-            model.set_default_attn_processor()
-
-        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdirname:
-            model.save_pretrained(tmpdirname, safe_serialization=False)
-
-            auto_model = AutoModel.from_pretrained(tmpdirname)
-            if hasattr(auto_model, "set_default_attn_processor"):
-                auto_model.set_default_attn_processor()
-
-        auto_model = auto_model.eval()
-        auto_model = auto_model.to(torch_device)
-
-        with torch.no_grad():
-            if self.forward_requires_fresh_args:
-                output_original = model(**self.inputs_dict(0))
-                output_auto = auto_model(**self.inputs_dict(0))
-            else:
-                output_original = model(**inputs_dict)
-                output_auto = auto_model(**inputs_dict)
-
-        if isinstance(output_original, dict):
-            output_original = output_original.to_tuple()[0]
-        if isinstance(output_auto, dict):
-            output_auto = output_auto.to_tuple()[0]
-
-        if isinstance(output_original, list):
-            output_original = torch.stack(output_original)
-        if isinstance(output_auto, list):
-            output_auto = torch.stack(output_auto)
-
-        output_original, output_auto = output_original.float(), output_auto.float()
-
-        max_diff = (output_original - output_auto).abs().max().item()
-        self.assertLessEqual(
-            max_diff,
-            expected_max_diff,
-            f"AutoModel forward pass diff: {max_diff} exceeds threshold {expected_max_diff}",
-        )
-
-    @parameterized.expand(
-        [
-            (-1, "You can't pass device_map as a negative int"),
-            ("foo", "When passing device_map as a string, the value needs to be a device name"),
-        ]
-    )
-    def test_wrong_device_map_raises_error(self, device_map, msg_substring):
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict)
-        with tempfile.TemporaryDirectory() as tmpdir:
-            model.save_pretrained(tmpdir)
-            with self.assertRaises(ValueError) as err_ctx:
-                _ = self.model_class.from_pretrained(tmpdir, device_map=device_map)
-
-        assert msg_substring in str(err_ctx.exception)
-
-    @parameterized.expand([0, torch_device, torch.device(torch_device)])
-    @require_torch_accelerator
-    def test_passing_non_dict_device_map_works(self, device_map):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).eval()
-        with tempfile.TemporaryDirectory() as tmpdir:
-            model.save_pretrained(tmpdir)
-            loaded_model = self.model_class.from_pretrained(tmpdir, device_map=device_map)
-            _ = loaded_model(**inputs_dict)
-
-    @parameterized.expand([("", torch_device), ("", torch.device(torch_device))])
-    @require_torch_accelerator
-    def test_passing_dict_device_map_works(self, name, device):
-        # There are other valid dict-based `device_map` values too. It's best to refer to
-        # the docs for those: https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap.
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).eval()
-        device_map = {name: device}
-        with tempfile.TemporaryDirectory() as tmpdir:
-            model.save_pretrained(tmpdir)
-            loaded_model = self.model_class.from_pretrained(tmpdir, device_map=device_map)
-            _ = loaded_model(**inputs_dict)
-
-
 @is_staging_test
-class ModelPushToHubTester(unittest.TestCase):
+class TestModelPushToHub:
     identifier = uuid.uuid4()
     repo_id = f"test-model-{identifier}"
     org_repo_id = f"valid_org/{repo_id}-org"
@@ -2041,7 +329,7 @@ def test_push_to_hub(self):
 
         new_model = UNet2DConditionModel.from_pretrained(f"{USER}/{self.repo_id}")
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
+            assert torch.equal(p1, p2)
 
         # Push to hub via save_pretrained to a separate repo. Reusing `self.repo_id` after
         # deleting it makes the staging server's LFS GC reject the next commit with
@@ -2052,7 +340,7 @@ def test_push_to_hub(self):
 
         new_model = UNet2DConditionModel.from_pretrained(f"{USER}/{save_repo_id}")
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
+            assert torch.equal(p1, p2)
 
         # Reset repos
         delete_repo(token=TOKEN, repo_id=self.repo_id)
@@ -2073,7 +361,7 @@ def test_push_to_hub_in_organization(self):
 
         new_model = UNet2DConditionModel.from_pretrained(self.org_repo_id)
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
+            assert torch.equal(p1, p2)
 
         # Push to hub via save_pretrained to a separate repo. Reusing `self.org_repo_id` after
         # deleting it makes the staging server's LFS GC reject the next commit with
@@ -2084,13 +372,13 @@ def test_push_to_hub_in_organization(self):
 
         new_model = UNet2DConditionModel.from_pretrained(save_org_repo_id)
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            self.assertTrue(torch.equal(p1, p2))
+            assert torch.equal(p1, p2)
 
         # Reset repos
         delete_repo(token=TOKEN, repo_id=self.org_repo_id)
         delete_repo(save_org_repo_id, token=TOKEN)
 
-    @unittest.skipIf(
+    @pytest.mark.skipif(
         not is_jinja_available(),
         reason="Model card tests cannot be performed without Jinja installed.",
     )
@@ -2115,403 +403,3 @@ def test_push_to_hub_library_name(self):
 
         # Reset repo
         delete_repo(repo_id, token=TOKEN)
-
-
-@require_torch_accelerator
-@require_torch_2
-@is_torch_compile
-@slow
-@require_torch_version_greater("2.7.1")
-class TorchCompileTesterMixin:
-    different_shapes_for_compilation = None
-
-    def setUp(self):
-        # clean up the VRAM before each test
-        super().setUp()
-        torch.compiler.reset()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def tearDown(self):
-        # clean up the VRAM after each test in case of CUDA runtime errors
-        super().tearDown()
-        torch.compiler.reset()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def test_torch_compile_recompilation_and_graph_break(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict).to(torch_device)
-        model.eval()
-        model = torch.compile(model, fullgraph=True)
-
-        with (
-            torch._inductor.utils.fresh_inductor_cache(),
-            torch._dynamo.config.patch(error_on_recompile=True),
-            torch.no_grad(),
-        ):
-            _ = model(**inputs_dict)
-            _ = model(**inputs_dict)
-
-    def test_torch_compile_repeated_blocks(self):
-        if self.model_class._repeated_blocks is None:
-            pytest.skip("Skipping test as the model class doesn't have `_repeated_blocks` set.")
-
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict).to(torch_device)
-        model.eval()
-        model.compile_repeated_blocks(fullgraph=True)
-
-        recompile_limit = 1
-        if self.model_class.__name__ == "UNet2DConditionModel":
-            recompile_limit = 2
-        elif self.model_class.__name__ == "ZImageTransformer2DModel":
-            recompile_limit = 3
-
-        with (
-            torch._inductor.utils.fresh_inductor_cache(),
-            torch._dynamo.config.patch(recompile_limit=recompile_limit),
-            torch.no_grad(),
-        ):
-            _ = model(**inputs_dict)
-            _ = model(**inputs_dict)
-
-    def test_compile_with_group_offloading(self):
-        if not self.model_class._supports_group_offloading:
-            pytest.skip("Model does not support group offloading.")
-
-        torch._dynamo.config.cache_size_limit = 10000
-
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict)
-        model.eval()
-        # TODO: Can test for other group offloading kwargs later if needed.
-        group_offload_kwargs = {
-            "onload_device": torch_device,
-            "offload_device": "cpu",
-            "offload_type": "block_level",
-            "num_blocks_per_group": 1,
-            "use_stream": True,
-            "non_blocking": True,
-        }
-        model.enable_group_offload(**group_offload_kwargs)
-        model.compile()
-
-        with torch.no_grad():
-            _ = model(**inputs_dict)
-            _ = model(**inputs_dict)
-
-    def test_compile_on_different_shapes(self):
-        if self.different_shapes_for_compilation is None:
-            pytest.skip(f"Skipping as `different_shapes_for_compilation` is not set for {self.__class__.__name__}.")
-        torch.fx.experimental._config.use_duck_shape = False
-
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-        model.eval()
-        model = torch.compile(model, fullgraph=True, dynamic=True)
-
-        for height, width in self.different_shapes_for_compilation:
-            with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
-                inputs_dict = self.prepare_dummy_input(height=height, width=width)
-                _ = model(**inputs_dict)
-
-    def test_compile_works_with_aot(self):
-        from torch._inductor.package import load_package
-
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict).to(torch_device)
-        exported_model = torch.export.export(model, args=(), kwargs=inputs_dict)
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            package_path = os.path.join(tmpdir, f"{self.model_class.__name__}.pt2")
-            _ = torch._inductor.aoti_compile_and_package(exported_model, package_path=package_path)
-            assert os.path.exists(package_path)
-            loaded_binary = load_package(package_path, run_single_threaded=True)
-
-        model.forward = loaded_binary
-
-        with torch.no_grad():
-            _ = model(**inputs_dict)
-            _ = model(**inputs_dict)
-
-
-@slow
-@require_torch_2
-@require_torch_accelerator
-@require_peft_backend
-@require_peft_version_greater("0.14.0")
-@require_torch_version_greater("2.7.1")
-@is_torch_compile
-class LoraHotSwappingForModelTesterMixin:
-    """Test that hotswapping does not result in recompilation on the model directly.
-
-    We're not extensively testing the hotswapping functionality since it is implemented in PEFT and is extensively
-    tested there. The goal of this test is specifically to ensure that hotswapping with diffusers does not require
-    recompilation.
-
-    See
-    https://github.com/huggingface/peft/blob/eaab05e18d51fb4cce20a73c9acd82a00c013b83/tests/test_gpu_examples.py#L4252
-    for the analogous PEFT test.
-
-    """
-
-    different_shapes_for_compilation = None
-
-    def tearDown(self):
-        # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
-        # there will be recompilation errors, as torch caches the model when run in the same process.
-        super().tearDown()
-        torch.compiler.reset()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def get_lora_config(self, lora_rank, lora_alpha, target_modules):
-        from peft import LoraConfig
-
-        lora_config = LoraConfig(
-            r=lora_rank,
-            lora_alpha=lora_alpha,
-            target_modules=target_modules,
-            init_lora_weights=False,
-            use_dora=False,
-        )
-        return lora_config
-
-    def get_linear_module_name_other_than_attn(self, model):
-        linear_names = [
-            name for name, module in model.named_modules() if isinstance(module, nn.Linear) and "to_" not in name
-        ]
-        return linear_names[0]
-
-    def check_model_hotswap(self, do_compile, rank0, rank1, target_modules0, target_modules1=None):
-        """
-        Check that hotswapping works on a small unet.
-
-        Steps:
-        - create 2 LoRA adapters and save them
-        - load the first adapter
-        - hotswap the second adapter
-        - check that the outputs are correct
-        - optionally compile the model
-        - optionally check if recompilations happen on different shapes
-
-        Note: We set rank == alpha here because save_lora_adapter does not save the alpha scalings, thus the test would
-        fail if the values are different. Since rank != alpha does not matter for the purpose of this test, this is
-        fine.
-        """
-        different_shapes = self.different_shapes_for_compilation
-        # create 2 adapters with different ranks and alphas
-        torch.manual_seed(0)
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-
-        alpha0, alpha1 = rank0, rank1
-        max_rank = max([rank0, rank1])
-        if target_modules1 is None:
-            target_modules1 = target_modules0[:]
-        lora_config0 = self.get_lora_config(rank0, alpha0, target_modules0)
-        lora_config1 = self.get_lora_config(rank1, alpha1, target_modules1)
-
-        model.add_adapter(lora_config0, adapter_name="adapter0")
-        with torch.inference_mode():
-            torch.manual_seed(0)
-            output0_before = model(**inputs_dict)["sample"]
-
-        model.add_adapter(lora_config1, adapter_name="adapter1")
-        model.set_adapter("adapter1")
-        with torch.inference_mode():
-            torch.manual_seed(0)
-            output1_before = model(**inputs_dict)["sample"]
-
-        # sanity checks:
-        tol = 5e-3
-        assert not torch.allclose(output0_before, output1_before, atol=tol, rtol=tol)
-        assert not (output0_before == 0).all()
-        assert not (output1_before == 0).all()
-
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            # save the adapter checkpoints
-            model.save_lora_adapter(os.path.join(tmp_dirname, "0"), safe_serialization=True, adapter_name="adapter0")
-            model.save_lora_adapter(os.path.join(tmp_dirname, "1"), safe_serialization=True, adapter_name="adapter1")
-            del model
-
-            # load the first adapter
-            torch.manual_seed(0)
-            init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict).to(torch_device)
-
-            if do_compile or (rank0 != rank1):
-                # no need to prepare if the model is not compiled or if the ranks are identical
-                model.enable_lora_hotswap(target_rank=max_rank)
-
-            file_name0 = os.path.join(os.path.join(tmp_dirname, "0"), "pytorch_lora_weights.safetensors")
-            file_name1 = os.path.join(os.path.join(tmp_dirname, "1"), "pytorch_lora_weights.safetensors")
-            model.load_lora_adapter(file_name0, safe_serialization=True, adapter_name="adapter0", prefix=None)
-
-            if do_compile:
-                model = torch.compile(model, mode="reduce-overhead", dynamic=different_shapes is not None)
-
-            with torch.inference_mode():
-                # additionally check if dynamic compilation works.
-                if different_shapes is not None:
-                    for height, width in different_shapes:
-                        new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
-                        _ = model(**new_inputs_dict)
-                else:
-                    output0_after = model(**inputs_dict)["sample"]
-                    assert torch.allclose(output0_before, output0_after, atol=tol, rtol=tol)
-
-            # hotswap the 2nd adapter
-            model.load_lora_adapter(file_name1, adapter_name="adapter0", hotswap=True, prefix=None)
-
-            # we need to call forward to potentially trigger recompilation
-            with torch.inference_mode():
-                if different_shapes is not None:
-                    for height, width in different_shapes:
-                        new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
-                        _ = model(**new_inputs_dict)
-                else:
-                    output1_after = model(**inputs_dict)["sample"]
-                    assert torch.allclose(output1_before, output1_after, atol=tol, rtol=tol)
-
-            # check error when not passing valid adapter name
-            name = "does-not-exist"
-            msg = f"Trying to hotswap LoRA adapter '{name}' but there is no existing adapter by that name"
-            with self.assertRaisesRegex(ValueError, msg):
-                model.load_lora_adapter(file_name1, adapter_name=name, hotswap=True, prefix=None)
-
-    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
-    def test_hotswapping_model(self, rank0, rank1):
-        self.check_model_hotswap(
-            do_compile=False, rank0=rank0, rank1=rank1, target_modules0=["to_q", "to_k", "to_v", "to_out.0"]
-        )
-
-    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
-    def test_hotswapping_compiled_model_linear(self, rank0, rank1):
-        # It's important to add this context to raise an error on recompilation
-        target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
-            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
-
-    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
-    def test_hotswapping_compiled_model_conv2d(self, rank0, rank1):
-        if "unet" not in self.model_class.__name__.lower():
-            pytest.skip("Test only applies to UNet.")
-
-        # It's important to add this context to raise an error on recompilation
-        target_modules = ["conv", "conv1", "conv2"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
-            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
-
-    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
-    def test_hotswapping_compiled_model_both_linear_and_conv2d(self, rank0, rank1):
-        if "unet" not in self.model_class.__name__.lower():
-            pytest.skip("Test only applies to UNet.")
-
-        # It's important to add this context to raise an error on recompilation
-        target_modules = ["to_q", "conv"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
-            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
-
-    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
-    def test_hotswapping_compiled_model_both_linear_and_other(self, rank0, rank1):
-        # In `test_hotswapping_compiled_model_both_linear_and_conv2d()`, we check if we can do hotswapping
-        # with `torch.compile()` for models that have both linear and conv layers. In this test, we check
-        # if we can target a linear layer from the transformer blocks and another linear layer from non-attention
-        # block.
-        target_modules = ["to_q"]
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict)
-
-        target_modules.append(self.get_linear_module_name_other_than_attn(model))
-        del model
-
-        # It's important to add this context to raise an error on recompilation
-        with torch._dynamo.config.patch(error_on_recompile=True):
-            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
-
-    def test_enable_lora_hotswap_called_after_adapter_added_raises(self):
-        # ensure that enable_lora_hotswap is called before loading the first adapter
-        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-        model.add_adapter(lora_config)
-
-        msg = re.escape("Call `enable_lora_hotswap` before loading the first adapter.")
-        with self.assertRaisesRegex(RuntimeError, msg):
-            model.enable_lora_hotswap(target_rank=32)
-
-    def test_enable_lora_hotswap_called_after_adapter_added_warning(self):
-        # ensure that enable_lora_hotswap is called before loading the first adapter
-        from diffusers.loaders.peft import logger
-
-        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-        model.add_adapter(lora_config)
-        msg = (
-            "It is recommended to call `enable_lora_hotswap` before loading the first adapter to avoid recompilation."
-        )
-        with self.assertLogs(logger=logger, level="WARNING") as cm:
-            model.enable_lora_hotswap(target_rank=32, check_compiled="warn")
-            assert any(msg in log for log in cm.output)
-
-    def test_enable_lora_hotswap_called_after_adapter_added_ignore(self):
-        # check possibility to ignore the error/warning
-        from diffusers.loaders.peft import logger
-
-        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-        model.add_adapter(lora_config)
-        # note: assertNoLogs requires Python 3.10+
-        with self.assertNoLogs(logger, level="WARNING"):
-            model.enable_lora_hotswap(target_rank=32, check_compiled="ignore")
-
-    def test_enable_lora_hotswap_wrong_check_compiled_argument_raises(self):
-        # check that wrong argument value raises an error
-        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-        model.add_adapter(lora_config)
-        msg = re.escape("check_compiles should be one of 'error', 'warn', or 'ignore', got 'wrong-argument' instead.")
-        with self.assertRaisesRegex(ValueError, msg):
-            model.enable_lora_hotswap(target_rank=32, check_compiled="wrong-argument")
-
-    def test_hotswap_second_adapter_targets_more_layers_raises(self):
-        # check the error and log
-        from diffusers.loaders.peft import logger
-
-        # at the moment, PEFT requires the 2nd adapter to target the same or a subset of layers
-        target_modules0 = ["to_q"]
-        target_modules1 = ["to_q", "to_k"]
-        with self.assertRaises(RuntimeError):  # peft raises RuntimeError
-            with self.assertLogs(logger=logger, level="ERROR") as cm:
-                self.check_model_hotswap(
-                    do_compile=True, rank0=8, rank1=8, target_modules0=target_modules0, target_modules1=target_modules1
-                )
-                assert any("Hotswapping adapter0 was unsuccessful" in log for log in cm.output)
-
-    @parameterized.expand([(11, 11), (7, 13), (13, 7)])
-    @require_torch_version_greater("2.7.1")
-    def test_hotswapping_compile_on_different_shapes(self, rank0, rank1):
-        different_shapes_for_compilation = self.different_shapes_for_compilation
-        if different_shapes_for_compilation is None:
-            pytest.skip(f"Skipping as `different_shapes_for_compilation` is not set for {self.__class__.__name__}.")
-        # Specifying `use_duck_shape=False` instructs the compiler if it should use the same symbolic
-        # variable to represent input sizes that are the same. For more details,
-        # check out this [comment](https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790).
-        torch.fx.experimental._config.use_duck_shape = False
-
-        target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
-        with torch._dynamo.config.patch(error_on_recompile=True):
-            self.check_model_hotswap(
-                do_compile=True,
-                rank0=rank0,
-                rank1=rank1,
-                target_modules0=target_modules,
-            )
diff --git a/tests/models/transformers/test_models_dit_transformer2d.py b/tests/models/transformers/test_models_dit_transformer2d.py
index 473a87637578..f1efb362d104 100644
--- a/tests/models/transformers/test_models_dit_transformer2d.py
+++ b/tests/models/transformers/test_models_dit_transformer2d.py
@@ -13,52 +13,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
+import pytest
 import torch
 
 from diffusers import DiTTransformer2DModel, Transformer2DModel
-
-from ...testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    slow,
-    torch_device,
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...testing_utils import enable_full_determinism, slow, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class DiTTransformer2DModelTests(ModelTesterMixin, unittest.TestCase):
-    model_class = DiTTransformer2DModel
-    main_input_name = "hidden_states"
-
+class DiTTransformer2DTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 4
-        in_channels = 4
-        sample_size = 8
-        scheduler_num_train_steps = 1000
-        num_class_labels = 4
+    def model_class(self):
+        return DiTTransformer2DModel
 
-        hidden_states = floats_tensor((batch_size, in_channels, sample_size, sample_size)).to(torch_device)
-        timesteps = torch.randint(0, scheduler_num_train_steps, size=(batch_size,)).to(torch_device)
-        class_label_ids = torch.randint(0, num_class_labels, size=(batch_size,)).to(torch_device)
-
-        return {"hidden_states": hidden_states, "timestep": timesteps, "class_labels": class_label_ids}
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (4, 8, 8)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (8, 8, 8)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "in_channels": 4,
             "out_channels": 8,
             "activation_fn": "gelu-approximate",
@@ -71,26 +67,38 @@ def prepare_init_args_and_inputs_for_common(self):
             "patch_size": 2,
             "sample_size": 8,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
-    def test_output(self):
-        super().test_output(
-            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
-        )
+    def get_dummy_inputs(self, batch_size: int = 4) -> dict[str, torch.Tensor]:
+        in_channels = 4
+        sample_size = 8
+        scheduler_num_train_steps = 1000
+        num_class_labels = 4
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, in_channels, sample_size, sample_size), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, scheduler_num_train_steps, size=(batch_size,), generator=self.generator).to(
+                torch_device
+            ),
+            "class_labels": torch.randint(0, num_class_labels, size=(batch_size,), generator=self.generator).to(
+                torch_device
+            ),
+        }
+
+
+class TestDiTTransformer2D(DiTTransformer2DTesterConfig, ModelTesterMixin):
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
+        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
+        pytest.skip("Tolerance requirements too high for meaningful test")
 
     def test_correct_class_remapping_from_dict_config(self):
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        init_dict = self.get_init_dict()
         model = Transformer2DModel.from_config(init_dict)
         assert isinstance(model, DiTTransformer2DModel)
 
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"DiTTransformer2DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-    def test_effective_gradient_checkpointing(self):
-        super().test_effective_gradient_checkpointing(loss_tolerance=1e-4)
-
     def test_correct_class_remapping_from_pretrained_config(self):
         config = DiTTransformer2DModel.load_config("facebook/DiT-XL-2-256", subfolder="transformer")
         model = Transformer2DModel.from_config(config)
@@ -100,3 +108,20 @@ def test_correct_class_remapping_from_pretrained_config(self):
     def test_correct_class_remapping(self):
         model = Transformer2DModel.from_pretrained("facebook/DiT-XL-2-256", subfolder="transformer")
         assert isinstance(model, DiTTransformer2DModel)
+
+
+class TestDiTTransformer2DMemory(DiTTransformer2DTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestDiTTransformer2DAttention(DiTTransformer2DTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestDiTTransformer2DTraining(DiTTransformer2DTesterConfig, TrainingTesterMixin):
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"DiTTransformer2DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    def test_gradient_checkpointing_equivalence(self):
+        super().test_gradient_checkpointing_equivalence(loss_tolerance=1e-4)
diff --git a/tests/models/transformers/test_models_pixart_transformer2d.py b/tests/models/transformers/test_models_pixart_transformer2d.py
index 17c400cf1911..879274c52dbd 100644
--- a/tests/models/transformers/test_models_pixart_transformer2d.py
+++ b/tests/models/transformers/test_models_pixart_transformer2d.py
@@ -13,60 +13,53 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
+import pytest
 import torch
 
 from diffusers import PixArtTransformer2DModel, Transformer2DModel
-
-from ...testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    slow,
-    torch_device,
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...testing_utils import enable_full_determinism, slow, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class PixArtTransformer2DModelTests(ModelTesterMixin, unittest.TestCase):
-    model_class = PixArtTransformer2DModel
-    main_input_name = "hidden_states"
-    # We override the items here because the transformer under consideration is small.
-    model_split_percents = [0.7, 0.6, 0.6]
-
+class PixArtTransformer2DTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 4
-        in_channels = 4
-        sample_size = 8
-        scheduler_num_train_steps = 1000
-        cross_attention_dim = 8
-        seq_len = 8
+    def model_class(self):
+        return PixArtTransformer2DModel
 
-        hidden_states = floats_tensor((batch_size, in_channels, sample_size, sample_size)).to(torch_device)
-        timesteps = torch.randint(0, scheduler_num_train_steps, size=(batch_size,)).to(torch_device)
-        encoder_hidden_states = floats_tensor((batch_size, seq_len, cross_attention_dim)).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "timestep": timesteps,
-            "encoder_hidden_states": encoder_hidden_states,
-            "added_cond_kwargs": {"aspect_ratio": None, "resolution": None},
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (4, 8, 8)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (8, 8, 8)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def model_split_percents(self) -> list:
+        # We override the items here because the transformer under consideration is small.
+        return [0.7, 0.6, 0.6]
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "sample_size": 8,
             "num_layers": 1,
             "patch_size": 2,
@@ -84,20 +77,37 @@ def prepare_init_args_and_inputs_for_common(self):
             "use_additional_conditions": False,
             "caption_channels": None,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
-    def test_output(self):
-        super().test_output(
-            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
-        )
+    def get_dummy_inputs(self, batch_size: int = 4) -> dict[str, torch.Tensor]:
+        in_channels = 4
+        sample_size = 8
+        scheduler_num_train_steps = 1000
+        cross_attention_dim = 8
+        seq_len = 8
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, in_channels, sample_size, sample_size), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, scheduler_num_train_steps, size=(batch_size,), generator=self.generator).to(
+                torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, seq_len, cross_attention_dim), generator=self.generator, device=torch_device
+            ),
+            "added_cond_kwargs": {"aspect_ratio": None, "resolution": None},
+        }
+
 
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"PixArtTransformer2DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+class TestPixArtTransformer2D(PixArtTransformer2DTesterConfig, ModelTesterMixin):
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
+        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
+        pytest.skip("Tolerance requirements too high for meaningful test")
 
     def test_correct_class_remapping_from_dict_config(self):
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        init_dict = self.get_init_dict()
         model = Transformer2DModel.from_config(init_dict)
         assert isinstance(model, PixArtTransformer2DModel)
 
@@ -110,3 +120,17 @@ def test_correct_class_remapping_from_pretrained_config(self):
     def test_correct_class_remapping(self):
         model = Transformer2DModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="transformer")
         assert isinstance(model, PixArtTransformer2DModel)
+
+
+class TestPixArtTransformer2DMemory(PixArtTransformer2DTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestPixArtTransformer2DAttention(PixArtTransformer2DTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestPixArtTransformer2DTraining(PixArtTransformer2DTesterConfig, TrainingTesterMixin):
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"PixArtTransformer2DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_prior.py b/tests/models/transformers/test_models_prior.py
index af5ac4bbbd76..1da32b77786a 100644
--- a/tests/models/transformers/test_models_prior.py
+++ b/tests/models/transformers/test_models_prior.py
@@ -21,41 +21,69 @@
 from parameterized import parameterized
 
 from diffusers import PriorTransformer
+from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
-    floats_tensor,
     slow,
     torch_all_close,
     torch_device,
 )
-from ..test_modeling_common import ModelTesterMixin
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
+)
 
 
 enable_full_determinism()
 
 
-class PriorTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = PriorTransformer
-    main_input_name = "hidden_states"
+class PriorTransformerTesterConfig(BaseModelTesterConfig):
+    @property
+    def model_class(self):
+        return PriorTransformer
 
     @property
-    def dummy_input(self):
-        batch_size = 4
-        embedding_dim = 8
-        num_embeddings = 7
+    def main_input_name(self) -> str:
+        return "hidden_states"
+
+    @property
+    def input_shape(self) -> tuple:
+        return (4, 8)
 
-        hidden_states = floats_tensor((batch_size, embedding_dim)).to(torch_device)
+    @property
+    def output_shape(self) -> tuple:
+        return (4, 8)
 
-        proj_embedding = floats_tensor((batch_size, embedding_dim)).to(torch_device)
-        encoder_hidden_states = floats_tensor((batch_size, num_embeddings, embedding_dim)).to(torch_device)
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
 
+    def get_init_dict(self) -> dict:
         return {
-            "hidden_states": hidden_states,
+            "num_attention_heads": 2,
+            "attention_head_dim": 4,
+            "num_layers": 2,
+            "embedding_dim": 8,
+            "num_embeddings": 7,
+            "additional_embeddings": 4,
+        }
+
+    def get_dummy_inputs(self, batch_size: int = 4) -> dict:
+        embedding_dim = 8
+        num_embeddings = 7
+
+        return {
+            "hidden_states": randn_tensor((batch_size, embedding_dim), generator=self.generator, device=torch_device),
             "timestep": 2,
-            "proj_embedding": proj_embedding,
-            "encoder_hidden_states": encoder_hidden_states,
+            "proj_embedding": randn_tensor((batch_size, embedding_dim), generator=self.generator, device=torch_device),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, num_embeddings, embedding_dim), generator=self.generator, device=torch_device
+            ),
         }
 
     def get_dummy_seed_input(self, seed=0):
@@ -65,7 +93,6 @@ def get_dummy_seed_input(self, seed=0):
         num_embeddings = 7
 
         hidden_states = torch.randn((batch_size, embedding_dim)).to(torch_device)
-
         proj_embedding = torch.randn((batch_size, embedding_dim)).to(torch_device)
         encoder_hidden_states = torch.randn((batch_size, num_embeddings, embedding_dim)).to(torch_device)
 
@@ -76,48 +103,28 @@ def get_dummy_seed_input(self, seed=0):
             "encoder_hidden_states": encoder_hidden_states,
         }
 
-    @property
-    def input_shape(self):
-        return (4, 8)
-
-    @property
-    def output_shape(self):
-        return (4, 8)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "num_attention_heads": 2,
-            "attention_head_dim": 4,
-            "num_layers": 2,
-            "embedding_dim": 8,
-            "num_embeddings": 7,
-            "additional_embeddings": 4,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
+class TestPriorTransformer(PriorTransformerTesterConfig, ModelTesterMixin):
     def test_from_pretrained_hub(self):
         model, loading_info = PriorTransformer.from_pretrained(
             "hf-internal-testing/prior-dummy", output_loading_info=True
         )
-        self.assertIsNotNone(model)
-        self.assertEqual(len(loading_info["missing_keys"]), 0)
+        assert model is not None
+        assert len(loading_info["missing_keys"]) == 0
 
         model.to(torch_device)
-        hidden_states = model(**self.dummy_input)[0]
+        hidden_states = model(**self.get_dummy_inputs())[0]
 
         assert hidden_states is not None, "Make sure output is not None"
 
     def test_forward_signature(self):
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict)
+        model = self.model_class(**self.get_init_dict())
         signature = inspect.signature(model.forward)
         # signature.parameters is an OrderedDict => so arg_names order is deterministic
         arg_names = [*signature.parameters.keys()]
 
         expected_arg_names = ["hidden_states", "timestep"]
-        self.assertListEqual(arg_names[:2], expected_arg_names)
+        assert arg_names[:2] == expected_arg_names
 
     def test_output_pretrained(self):
         model = PriorTransformer.from_pretrained("hf-internal-testing/prior-dummy")
@@ -136,7 +143,19 @@ def test_output_pretrained(self):
         # Since the VAE Gaussian prior's generator is seeded on the appropriate device,
         # the expected output slices are not the same for CPU and GPU.
         expected_output_slice = torch.tensor([-1.3436, -0.2870, 0.7538, 0.4368, -0.0239])
-        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
+        assert torch_all_close(output_slice, expected_output_slice, rtol=1e-2)
+
+
+class TestPriorTransformerMemory(PriorTransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestPriorTransformerAttention(PriorTransformerTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestPriorTransformerTraining(PriorTransformerTesterConfig, TrainingTesterMixin):
+    pass
 
 
 @slow
diff --git a/tests/models/transformers/test_models_transformer_allegro.py b/tests/models/transformers/test_models_transformer_allegro.py
index 7c002f87819e..0c3e302a3f0d 100644
--- a/tests/models/transformers/test_models_transformer_allegro.py
+++ b/tests/models/transformers/test_models_transformer_allegro.py
@@ -12,57 +12,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers import AllegroTransformer3DModel
+from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class AllegroTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = AllegroTransformer3DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
+class AllegroTransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        num_frames = 2
-        height = 8
-        width = 8
-        embedding_dim = 16
-        sequence_length = 16
-
-        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim // 2)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+    def model_class(self):
+        return AllegroTransformer3DModel
 
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (4, 2, 8, 8)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (4, 2, 8, 8)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings.
             "num_attention_heads": 2,
             "attention_head_dim": 8,
@@ -75,9 +65,38 @@ def prepare_init_args_and_inputs_for_common(self):
             "sample_frames": 8,
             "caption_channels": 8,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        num_channels = 4
+        num_frames = 2
+        height = width = 8
+        embedding_dim = 16
+        sequence_length = 16
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim // 2), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+        }
+
+
+class TestAllegroTransformer(AllegroTransformerTesterConfig, ModelTesterMixin):
+    pass
+
+
+class TestAllegroTransformerMemory(AllegroTransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestAllegroTransformerAttention(AllegroTransformerTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestAllegroTransformerTraining(AllegroTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"AllegroTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_aura_flow.py b/tests/models/transformers/test_models_transformer_aura_flow.py
index ae8c3b7234a3..3e13945977fd 100644
--- a/tests/models/transformers/test_models_transformer_aura_flow.py
+++ b/tests/models/transformers/test_models_transformer_aura_flow.py
@@ -13,52 +13,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers import AuraFlowTransformer2DModel
+from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import enable_full_determinism, torch_device
-from ..test_modeling_common import ModelTesterMixin
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
+)
 
 
 enable_full_determinism()
 
 
-class AuraFlowTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = AuraFlowTransformer2DModel
-    main_input_name = "hidden_states"
-    # We override the items here because the transformer under consideration is small.
-    model_split_percents = [0.7, 0.6, 0.6]
-
+class AuraFlowTransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        height = width = embedding_dim = 32
-        sequence_length = 256
-
-        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+    def model_class(self):
+        return AuraFlowTransformer2DModel
 
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (4, 32, 32)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (4, 32, 32)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def model_split_percents(self) -> list:
+        # We override the items here because the transformer under consideration is small.
+        return [0.7, 0.6, 0.6]
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "sample_size": 32,
             "patch_size": 2,
             "in_channels": 4,
@@ -71,13 +71,36 @@ def prepare_init_args_and_inputs_for_common(self):
             "out_channels": 4,
             "pos_embed_max_size": 256,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        num_channels = 4
+        height = width = embedding_dim = 32
+        sequence_length = 256
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+        }
+
+
+class TestAuraFlowTransformer(AuraFlowTransformerTesterConfig, ModelTesterMixin):
+    pass
+
+
+class TestAuraFlowTransformerMemory(AuraFlowTransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestAuraFlowTransformerAttention(AuraFlowTransformerTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestAuraFlowTransformerTraining(AuraFlowTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"AuraFlowTransformer2DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-    @unittest.skip("AuraFlowTransformer2DModel uses its own dedicated attention processor. This test does not apply")
-    def test_set_attn_processor_for_determinism(self):
-        pass
diff --git a/tests/models/transformers/test_models_transformer_cogvideox.py b/tests/models/transformers/test_models_transformer_cogvideox.py
index f632add7e5a7..97ac1b40621f 100644
--- a/tests/models/transformers/test_models_transformer_cogvideox.py
+++ b/tests/models/transformers/test_models_transformer_cogvideox.py
@@ -13,58 +13,51 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers import CogVideoXTransformer3DModel
-
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class CogVideoXTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = CogVideoXTransformer3DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-    model_split_percents = [0.7, 0.7, 0.8]
-
+class CogVideoXTransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        num_frames = 1
-        height = 8
-        width = 8
-        embedding_dim = 8
-        sequence_length = 8
+    def model_class(self):
+        return CogVideoXTransformer3DModel
 
-        hidden_states = torch.randn((batch_size, num_frames, num_channels, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (1, 4, 8, 8)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (1, 4, 8, 8)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def model_split_percents(self) -> list:
+        return [0.7, 0.7, 0.8]
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings.
             "num_attention_heads": 2,
             "attention_head_dim": 8,
@@ -81,49 +74,36 @@ def prepare_init_args_and_inputs_for_common(self):
             "temporal_compression_ratio": 4,
             "max_text_seq_length": 8,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"CogVideoXTransformer3DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
 
-class CogVideoX1_5TransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = CogVideoXTransformer3DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
-    @property
-    def dummy_input(self):
-        batch_size = 2
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
         num_channels = 4
-        num_frames = 2
-        height = 8
-        width = 8
+        num_frames = 1
+        height = width = 8
         embedding_dim = 8
         sequence_length = 8
 
-        hidden_states = torch.randn((batch_size, num_frames, num_channels, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-
         return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
+            "hidden_states": randn_tensor(
+                (batch_size, num_frames, num_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
         }
 
+
+class CogVideoX15TransformerTesterConfig(CogVideoXTransformerTesterConfig):
     @property
-    def input_shape(self):
-        return (1, 4, 8, 8)
+    def output_shape(self) -> tuple:
+        return (2, 4, 8, 8)
 
     @property
-    def output_shape(self):
-        return (1, 4, 8, 8)
+    def model_split_percents(self) -> list:
+        return [0.9]
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    def get_init_dict(self) -> dict:
+        return {
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings.
             "num_attention_heads": 2,
             "attention_head_dim": 8,
@@ -141,9 +121,56 @@ def prepare_init_args_and_inputs_for_common(self):
             "max_text_seq_length": 8,
             "use_rotary_positional_embeddings": True,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        num_channels = 4
+        num_frames = 2
+        height = width = 8
+        embedding_dim = 8
+        sequence_length = 8
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_frames, num_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+        }
+
+
+class TestCogVideoXTransformer(CogVideoXTransformerTesterConfig, ModelTesterMixin):
+    pass
+
+
+class TestCogVideoXTransformerMemory(CogVideoXTransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestCogVideoXTransformerAttention(CogVideoXTransformerTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestCogVideoXTransformerTraining(CogVideoXTransformerTesterConfig, TrainingTesterMixin):
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"CogVideoXTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class TestCogVideoX15Transformer(CogVideoX15TransformerTesterConfig, ModelTesterMixin):
+    pass
+
+
+class TestCogVideoX15TransformerMemory(CogVideoX15TransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestCogVideoX15TransformerAttention(CogVideoX15TransformerTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestCogVideoX15TransformerTraining(CogVideoX15TransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"CogVideoXTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_cogview3plus.py b/tests/models/transformers/test_models_transformer_cogview3plus.py
index d38d77531d4c..97ac28a108e1 100644
--- a/tests/models/transformers/test_models_transformer_cogview3plus.py
+++ b/tests/models/transformers/test_models_transformer_cogview3plus.py
@@ -13,63 +13,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
+import pytest
 import torch
 
 from diffusers import CogView3PlusTransformer2DModel
+from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class CogView3PlusTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = CogView3PlusTransformer2DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-    model_split_percents = [0.7, 0.6, 0.6]
-
+class CogView3PlusTransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        height = 8
-        width = 8
-        embedding_dim = 8
-        sequence_length = 8
+    def model_class(self):
+        return CogView3PlusTransformer2DModel
 
-        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        original_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        target_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        crop_coords = torch.tensor([0, 0]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "original_size": original_size,
-            "target_size": target_size,
-            "crop_coords": crop_coords,
-            "timestep": timestep,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (1, 4, 8, 8)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (1, 4, 8, 8)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def model_split_percents(self) -> list:
+        return [0.7, 0.6, 0.6]
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "patch_size": 2,
             "in_channels": 4,
             "num_layers": 2,
@@ -82,9 +71,48 @@ def prepare_init_args_and_inputs_for_common(self):
             "pos_embed_max_size": 8,
             "sample_size": 8,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        num_channels = 4
+        height = width = 8
+        embedding_dim = 8
+        sequence_length = 8
+
+        original_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        target_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        crop_coords = torch.tensor([0, 0]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "original_size": original_size,
+            "target_size": target_size,
+            "crop_coords": crop_coords,
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+        }
+
+
+class TestCogView3PlusTransformer(CogView3PlusTransformerTesterConfig, ModelTesterMixin):
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
+        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
+        pytest.skip("Tolerance requirements too high for meaningful test")
+
+
+class TestCogView3PlusTransformerMemory(CogView3PlusTransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestCogView3PlusTransformerAttention(CogView3PlusTransformerTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestCogView3PlusTransformerTraining(CogView3PlusTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"CogView3PlusTransformer2DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_cogview4.py b/tests/models/transformers/test_models_transformer_cogview4.py
index 084c3b7cea41..0f390cb356e9 100644
--- a/tests/models/transformers/test_models_transformer_cogview4.py
+++ b/tests/models/transformers/test_models_transformer_cogview4.py
@@ -12,59 +12,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers import CogView4Transformer2DModel
+from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import enable_full_determinism, torch_device
-from ..test_modeling_common import ModelTesterMixin
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
+)
 
 
 enable_full_determinism()
 
 
-class CogView3PlusTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = CogView4Transformer2DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
+class CogView4TransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        height = 8
-        width = 8
-        embedding_dim = 8
-        sequence_length = 8
+    def model_class(self):
+        return CogView4Transformer2DModel
 
-        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        original_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        target_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        crop_coords = torch.tensor([0, 0]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-            "original_size": original_size,
-            "target_size": target_size,
-            "crop_coords": crop_coords,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (4, 8, 8)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (4, 8, 8)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "patch_size": 2,
             "in_channels": 4,
             "num_layers": 2,
@@ -75,9 +63,44 @@ def prepare_init_args_and_inputs_for_common(self):
             "time_embed_dim": 8,
             "condition_dim": 4,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        num_channels = 4
+        height = width = 8
+        embedding_dim = 8
+        sequence_length = 8
+
+        original_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        target_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        crop_coords = torch.tensor([0, 0]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+            "original_size": original_size,
+            "target_size": target_size,
+            "crop_coords": crop_coords,
+        }
+
+
+class TestCogView4Transformer(CogView4TransformerTesterConfig, ModelTesterMixin):
+    pass
+
+
+class TestCogView4TransformerMemory(CogView4TransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestCogView4TransformerAttention(CogView4TransformerTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestCogView4TransformerTraining(CogView4TransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"CogView4Transformer2DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_consisid.py b/tests/models/transformers/test_models_transformer_consisid.py
index 77fc172d078a..cb02e8a359b3 100644
--- a/tests/models/transformers/test_models_transformer_consisid.py
+++ b/tests/models/transformers/test_models_transformer_consisid.py
@@ -13,61 +13,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers import ConsisIDTransformer3DModel
+from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class ConsisIDTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = ConsisIDTransformer3DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
+class ConsisIDTransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        num_frames = 1
-        height = 8
-        width = 8
-        embedding_dim = 8
-        sequence_length = 8
-
-        hidden_states = torch.randn((batch_size, num_frames, num_channels, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-        id_vit_hidden = [torch.ones([batch_size, 2, 2]).to(torch_device)] * 1
-        id_cond = torch.ones(batch_size, 2).to(torch_device)
+    def model_class(self):
+        return ConsisIDTransformer3DModel
 
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-            "id_vit_hidden": id_vit_hidden,
-            "id_cond": id_cond,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (1, 4, 8, 8)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (1, 4, 8, 8)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "num_attention_heads": 2,
             "attention_head_dim": 8,
             "in_channels": 4,
@@ -97,9 +82,36 @@ def prepare_init_args_and_inputs_for_common(self):
             "LFE_ff_mult": 1,
             "LFE_num_scale": 1,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        num_channels = 4
+        num_frames = 1
+        height = width = 8
+        embedding_dim = 8
+        sequence_length = 8
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_frames, num_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+            "id_vit_hidden": [torch.ones([batch_size, 2, 2]).to(torch_device)] * 1,
+            "id_cond": torch.ones(batch_size, 2).to(torch_device),
+        }
+
+
+class TestConsisIDTransformer(ConsisIDTransformerTesterConfig, ModelTesterMixin):
+    pass
+
+
+class TestConsisIDTransformerMemory(ConsisIDTransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestConsisIDTransformerTraining(ConsisIDTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"ConsisIDTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_latte.py b/tests/models/transformers/test_models_transformer_latte.py
index 7bf2c52e6269..946e5ce8a5a9 100644
--- a/tests/models/transformers/test_models_transformer_latte.py
+++ b/tests/models/transformers/test_models_transformer_latte.py
@@ -13,56 +13,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
+import pytest
 import torch
 
 from diffusers import LatteTransformer3DModel
-
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class LatteTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = LatteTransformer3DModel
-    main_input_name = "hidden_states"
-
+class LatteTransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        num_frames = 1
-        height = width = 8
-        embedding_dim = 8
-        sequence_length = 8
+    def model_class(self):
+        return LatteTransformer3DModel
 
-        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-            "enable_temporal_attentions": True,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (4, 1, 8, 8)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (8, 1, 8, 8)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "sample_size": 8,
             "num_layers": 1,
             "patch_size": 2,
@@ -79,14 +71,43 @@ def prepare_init_args_and_inputs_for_common(self):
             "norm_elementwise_affine": False,
             "norm_eps": 1e-6,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
-    def test_output(self):
-        super().test_output(
-            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
-        )
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        num_channels = 4
+        num_frames = 1
+        height = width = 8
+        embedding_dim = 8
+        sequence_length = 8
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+            "enable_temporal_attentions": True,
+        }
+
+
+class TestLatteTransformer(LatteTransformerTesterConfig, ModelTesterMixin):
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
+        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
+        pytest.skip("Tolerance requirements too high for meaningful test")
+
+
+class TestLatteTransformerMemory(LatteTransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestLatteTransformerAttention(LatteTransformerTesterConfig, AttentionTesterMixin):
+    pass
+
 
+class TestLatteTransformerTraining(LatteTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"LatteTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_motif_video.py b/tests/models/transformers/test_models_transformer_motif_video.py
index d3ac3a874927..8d8693acda37 100644
--- a/tests/models/transformers/test_models_transformer_motif_video.py
+++ b/tests/models/transformers/test_models_transformer_motif_video.py
@@ -19,10 +19,10 @@
 from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import enable_full_determinism, torch_device
-from ..test_modeling_common import LoraHotSwappingForModelTesterMixin
 from ..testing_utils import (
     AttentionTesterMixin,
     BaseModelTesterConfig,
+    LoraHotSwappingForModelTesterMixin,
     LoraTesterMixin,
     MemoryTesterMixin,
     ModelTesterMixin,
diff --git a/tests/models/transformers/test_models_transformer_sana_video.py b/tests/models/transformers/test_models_transformer_sana_video.py
index ff564ed8918d..e9d3a2d8da8e 100644
--- a/tests/models/transformers/test_models_transformer_sana_video.py
+++ b/tests/models/transformers/test_models_transformer_sana_video.py
@@ -12,57 +12,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
+import pytest
 import torch
 
 from diffusers import SanaVideoTransformer3DModel
-
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
 
 
 enable_full_determinism()
 
 
-class SanaVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
-    model_class = SanaVideoTransformer3DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
+class SanaVideoTransformer3DTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 1
-        num_channels = 16
-        num_frames = 2
-        height = 16
-        width = 16
-        text_encoder_embedding_dim = 16
-        sequence_length = 12
-
-        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
+    def model_class(self):
+        return SanaVideoTransformer3DModel
 
-        return {
-            "hidden_states": hidden_states,
-            "encoder_hidden_states": encoder_hidden_states,
-            "timestep": timestep,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (16, 2, 16, 16)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (16, 2, 16, 16)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "in_channels": 16,
             "out_channels": 16,
             "num_attention_heads": 2,
@@ -82,16 +73,44 @@ def prepare_init_args_and_inputs_for_common(self):
             "qk_norm": "rms_norm_across_heads",
             "rope_max_seq_len": 32,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"SanaVideoTransformer3DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+    def get_dummy_inputs(self, batch_size: int = 1) -> dict[str, torch.Tensor]:
+        num_channels = 16
+        num_frames = 2
+        height = width = 16
+        text_encoder_embedding_dim = 16
+        sequence_length = 12
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
+            ),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, sequence_length, text_encoder_embedding_dim),
+                generator=self.generator,
+                device=torch_device,
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+        }
 
 
-class SanaVideoTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
-    model_class = SanaVideoTransformer3DModel
+class TestSanaVideoTransformer3D(SanaVideoTransformer3DTesterConfig, ModelTesterMixin):
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
+        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
+        pytest.skip("Tolerance requirements too high for meaningful test")
 
-    def prepare_init_args_and_inputs_for_common(self):
-        return SanaVideoTransformer3DTests().prepare_init_args_and_inputs_for_common()
+
+class TestSanaVideoTransformer3DMemory(SanaVideoTransformer3DTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestSanaVideoTransformer3DAttention(SanaVideoTransformer3DTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestSanaVideoTransformer3DTraining(SanaVideoTransformer3DTesterConfig, TrainingTesterMixin):
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"SanaVideoTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_temporal.py b/tests/models/transformers/test_models_transformer_temporal.py
index aff83be51124..ff917f65cf33 100644
--- a/tests/models/transformers/test_models_transformer_temporal.py
+++ b/tests/models/transformers/test_models_transformer_temporal.py
@@ -13,55 +13,77 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
 import torch
 
 from diffusers.models.transformers import TransformerTemporalModel
+from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import (
-    enable_full_determinism,
-    torch_device,
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
 )
-from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class TemporalTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = TransformerTemporalModel
-    main_input_name = "hidden_states"
-
+class TemporalTransformerTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        height = width = 32
+    def model_class(self):
+        return TransformerTemporalModel
 
-        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "timestep": timestep,
-        }
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
 
     @property
-    def input_shape(self):
+    def input_shape(self) -> tuple:
         return (4, 32, 32)
 
     @property
-    def output_shape(self):
+    def output_shape(self) -> tuple:
         return (4, 32, 32)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        return {
             "num_attention_heads": 8,
             "attention_head_dim": 4,
             "in_channels": 4,
             "num_layers": 1,
             "norm_num_groups": 1,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
+
+    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+        num_channels = 4
+        height = width = 32
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
+            ),
+            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+        }
+
+
+class TestTemporalTransformer(TemporalTransformerTesterConfig, ModelTesterMixin):
+    pass
+
+
+class TestTemporalTransformerMemory(TemporalTransformerTesterConfig, MemoryTesterMixin):
+    pass
+
+
+class TestTemporalTransformerAttention(TemporalTransformerTesterConfig, AttentionTesterMixin):
+    pass
+
+
+class TestTemporalTransformerTraining(TemporalTransformerTesterConfig, TrainingTesterMixin):
+    pass
diff --git a/tests/others/test_utils.py b/tests/others/test_utils.py
index 4600f5f3710a..5db007b7ed6d 100755
--- a/tests/others/test_utils.py
+++ b/tests/others/test_utils.py
@@ -342,6 +342,6 @@ def is_staging_test(test_case):
     Those tests will run using the staging environment of huggingface.co instead of the real model hub.
     """
     if not _run_staging:
-        return unittest.skip("test is staging test")(test_case)
+        return pytest.mark.skip("test is staging test")(test_case)
     else:
         return pytest.mark.is_staging_test()(test_case)

From 178c4cb99be7ca9f28aeee556e37e9436c5e6f66 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 17 Jun 2026 06:16:10 +0000
Subject: [PATCH 2/7] fix extracter.

---
 utils/extract_tests_from_mixin.py | 40 +++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/utils/extract_tests_from_mixin.py b/utils/extract_tests_from_mixin.py
index c8b65b96ee16..04b157ff502c 100644
--- a/utils/extract_tests_from_mixin.py
+++ b/utils/extract_tests_from_mixin.py
@@ -30,32 +30,46 @@ def generate_pytest_pattern(test_methods: List[str]) -> str:
     return " or ".join(test_methods)
 
 
-def generate_pattern_for_mixin(mixin_class: Type) -> str:
+def generate_pattern_for_mixins(mixin_classes: List[Type]) -> str:
     """
-    Generate pytest pattern for a specific mixin class.
+    Generate a pytest pattern covering the test methods of all the given mixin classes.
     """
-    if mixin_cls is None:
-        return ""
-    test_methods = get_test_methods_from_class(mixin_class)
-    return generate_pytest_pattern(test_methods)
+    test_methods = set()
+    for mixin_class in mixin_classes:
+        test_methods.update(get_test_methods_from_class(mixin_class))
+    return generate_pytest_pattern(sorted(test_methods))
 
 
 if __name__ == "__main__":
-    mixin_cls = None
+    mixin_classes = []
     if args.type == "pipeline":
         from tests.pipelines.test_pipelines_common import PipelineTesterMixin
 
-        mixin_cls = PipelineTesterMixin
+        mixin_classes = [PipelineTesterMixin]
 
     elif args.type == "models":
-        from tests.models.test_modeling_common import ModelTesterMixin
-
-        mixin_cls = ModelTesterMixin
+        # The model tester suite is split across several mixins under `tests/models/testing_utils`,
+        # so aggregate their test methods to reconstruct the full coverage.
+        from tests.models.testing_utils import (
+            AttentionTesterMixin,
+            LoraTesterMixin,
+            MemoryTesterMixin,
+            ModelTesterMixin,
+            TrainingTesterMixin,
+        )
+
+        mixin_classes = [
+            ModelTesterMixin,
+            MemoryTesterMixin,
+            TrainingTesterMixin,
+            AttentionTesterMixin,
+            LoraTesterMixin,
+        ]
 
     elif args.type == "lora":
         from tests.lora.utils import PeftLoraLoaderMixinTests
 
-        mixin_cls = PeftLoraLoaderMixinTests
+        mixin_classes = [PeftLoraLoaderMixinTests]
 
-    pattern = generate_pattern_for_mixin(mixin_cls)
+    pattern = generate_pattern_for_mixins(mixin_classes)
     print(pattern)

From e54fc90fce5d8ae550657bd7456d16d62e10b63e Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 17 Jun 2026 07:45:27 +0000
Subject: [PATCH 3/7] fix cuda tests for models.

---
 src/diffusers/models/downsampling.py                  |  6 +++---
 src/diffusers/models/upsampling.py                    |  6 +++---
 .../autoencoders/test_models_autoencoder_tiny.py      |  7 ++++++-
 .../test_models_consistency_decoder_vae.py            |  9 ++++++++-
 .../controlnets/test_models_controlnet_cosmos.py      |  4 ++++
 tests/models/testing_utils/common.py                  | 11 ++++++++---
 .../transformers/test_models_transformer_z_image.py   |  4 ++++
 tests/testing_utils.py                                | 11 +++++++++++
 8 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/src/diffusers/models/downsampling.py b/src/diffusers/models/downsampling.py
index 871c0ed7ddf7..4c7a8f8c67bb 100644
--- a/src/diffusers/models/downsampling.py
+++ b/src/diffusers/models/downsampling.py
@@ -227,7 +227,7 @@ def _downsample_2d(
             stride_value = [factor, factor]
             upfirdn_input = upfirdn2d_native(
                 hidden_states,
-                torch.tensor(kernel, device=hidden_states.device),
+                kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
                 pad=((pad_value + 1) // 2, pad_value // 2),
             )
             output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0)
@@ -235,7 +235,7 @@ def _downsample_2d(
             pad_value = kernel.shape[0] - factor
             output = upfirdn2d_native(
                 hidden_states,
-                torch.tensor(kernel, device=hidden_states.device),
+                kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
                 down=factor,
                 pad=((pad_value + 1) // 2, pad_value // 2),
             )
@@ -392,7 +392,7 @@ def downsample_2d(
     pad_value = kernel.shape[0] - factor
     output = upfirdn2d_native(
         hidden_states,
-        kernel.to(device=hidden_states.device),
+        kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
         down=factor,
         pad=((pad_value + 1) // 2, pad_value // 2),
     )
diff --git a/src/diffusers/models/upsampling.py b/src/diffusers/models/upsampling.py
index cd3986287303..5a185b4d41f0 100644
--- a/src/diffusers/models/upsampling.py
+++ b/src/diffusers/models/upsampling.py
@@ -300,14 +300,14 @@ def _upsample_2d(
 
             output = upfirdn2d_native(
                 inverse_conv,
-                torch.tensor(kernel, device=inverse_conv.device),
+                kernel.to(device=inverse_conv.device, dtype=inverse_conv.dtype),
                 pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1),
             )
         else:
             pad_value = kernel.shape[0] - factor
             output = upfirdn2d_native(
                 hidden_states,
-                torch.tensor(kernel, device=hidden_states.device),
+                kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
                 up=factor,
                 pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
             )
@@ -508,7 +508,7 @@ def upsample_2d(
     pad_value = kernel.shape[0] - factor
     output = upfirdn2d_native(
         hidden_states,
-        kernel.to(device=hidden_states.device),
+        kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
         up=factor,
         pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
     )
diff --git a/tests/models/autoencoders/test_models_autoencoder_tiny.py b/tests/models/autoencoders/test_models_autoencoder_tiny.py
index 7fdab4aeb910..43dda6187505 100644
--- a/tests/models/autoencoders/test_models_autoencoder_tiny.py
+++ b/tests/models/autoencoders/test_models_autoencoder_tiny.py
@@ -76,7 +76,12 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestAutoencoderTiny(AutoencoderTinyTesterConfig, ModelTesterMixin):
-    pass
+    @pytest.mark.skip(
+        "`forward` round-trips the latents through a uint8 byte tensor (`.byte()` / `/ 255.0`), which upcasts to "
+        "float32 regardless of the model dtype, so full fp16/bf16 forward inference is not possible."
+    )
+    def test_from_save_pretrained_dtype_inference(self):
+        pass
 
 
 class TestAutoencoderTinyTraining(AutoencoderTinyTesterConfig, TrainingTesterMixin):
diff --git a/tests/models/autoencoders/test_models_consistency_decoder_vae.py b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
index 906baa60a9dc..2220da59c77d 100644
--- a/tests/models/autoencoders/test_models_consistency_decoder_vae.py
+++ b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
@@ -16,6 +16,7 @@
 import gc
 
 import numpy as np
+import pytest
 import torch
 
 from diffusers import ConsistencyDecoderVAE, StableDiffusionPipeline
@@ -86,7 +87,13 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestConsistencyDecoderVAE(ConsistencyDecoderVAETesterConfig, ModelTesterMixin):
-    pass
+    @pytest.mark.skip(
+        "`forward` decodes through an iterative, RNG-driven consistency-decoding loop whose output is not "
+        "reproducible across two model instances and amplifies fp16/bf16 nondeterminism, so a low-precision "
+        "output-equivalence check is not meaningful."
+    )
+    def test_from_save_pretrained_dtype_inference(self):
+        pass
 
 
 class TestConsistencyDecoderVAETraining(ConsistencyDecoderVAETesterConfig, TrainingTesterMixin):
diff --git a/tests/models/controlnets/test_models_controlnet_cosmos.py b/tests/models/controlnets/test_models_controlnet_cosmos.py
index 9bef488a8106..e7ea6362213d 100644
--- a/tests/models/controlnets/test_models_controlnet_cosmos.py
+++ b/tests/models/controlnets/test_models_controlnet_cosmos.py
@@ -283,6 +283,10 @@ def test_training(self):
     def test_training_with_ema(self):
         super().test_training_with_ema()
 
+    @pytest.mark.skip("ControlNet outputs list of control blocks, not single tensor for MSE loss.")
+    def test_mixed_precision_training(self):
+        super().test_mixed_precision_training()
+
     @pytest.mark.skip("ControlNet output doesn't have .sample attribute.")
     def test_gradient_checkpointing_equivalence(self):
         super().test_gradient_checkpointing_equivalence()
diff --git a/tests/models/testing_utils/common.py b/tests/models/testing_utils/common.py
index 5726dba9c600..c468d19ad2a1 100644
--- a/tests/models/testing_utils/common.py
+++ b/tests/models/testing_utils/common.py
@@ -135,8 +135,9 @@ def cast_inputs_to_dtype(inputs, current_dtype, target_dtype):
         return inputs.to(target_dtype) if inputs.dtype == current_dtype else inputs
     if isinstance(inputs, dict):
         return {k: cast_inputs_to_dtype(v, current_dtype, target_dtype) for k, v in inputs.items()}
-    if isinstance(inputs, list):
-        return [cast_inputs_to_dtype(v, current_dtype, target_dtype) for v in inputs]
+    if isinstance(inputs, (list, tuple)):
+        # Preserve the container type so models that branch on it (e.g. `isinstance(..., tuple)`) still see a tuple.
+        return type(inputs)(cast_inputs_to_dtype(v, current_dtype, target_dtype) for v in inputs)
 
     return inputs
 
@@ -479,7 +480,11 @@ def test_keep_in_fp32_modules(self, tmp_path):
     )
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
     @torch.no_grad()
-    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype, atol=1e-4, rtol=0):
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # Low-precision inference is inherently lossy, and models that keep some modules in fp32 diverge further from
+        # the fully-cast reference. Tolerances reflect the dtype's precision rather than a tight fp32-style threshold.
+        atol = 3e-2 if dtype == torch.bfloat16 else 1e-2
+        rtol = 0
         model = self.model_class(**self.get_init_dict())
         model.to(torch_device)
         fp32_modules = model._keep_in_fp32_modules or []
diff --git a/tests/models/transformers/test_models_transformer_z_image.py b/tests/models/transformers/test_models_transformer_z_image.py
index 3a0fe18bc692..67a8fde0f411 100644
--- a/tests/models/transformers/test_models_transformer_z_image.py
+++ b/tests/models/transformers/test_models_transformer_z_image.py
@@ -250,6 +250,10 @@ def test_training(self):
     def test_training_with_ema(self):
         pass
 
+    @pytest.mark.skip("Model output `sample` is a list of tensors; mixed-precision training computes MSE loss on it.")
+    def test_mixed_precision_training(self):
+        pass
+
     @pytest.mark.skip("Test is not supported for handling main inputs that are lists.")
     def test_gradient_checkpointing_equivalence(self, loss_tolerance=1e-5, param_grad_tol=5e-5, skip=None):
         pass
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index a8306b3d65f8..86887d7af6e9 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -165,6 +165,17 @@ def assert_tensors_close(
     if not is_torch_available():
         raise ValueError("PyTorch needs to be installed to use this function.")
 
+    # Some models (e.g. Z-Image, Cosmos ControlNet) return a list/tuple of tensors as their output. Compare these
+    # element-wise so the same helper works regardless of whether the output is a single tensor or a sequence.
+    if isinstance(actual, (list, tuple)) or isinstance(expected, (list, tuple)):
+        if not (isinstance(actual, (list, tuple)) and isinstance(expected, (list, tuple))):
+            raise AssertionError(f"{msg} Type mismatch: actual {type(actual)} vs expected {type(expected)}")
+        if len(actual) != len(expected):
+            raise AssertionError(f"{msg} Length mismatch: actual {len(actual)} vs expected {len(expected)}")
+        for i, (a, e) in enumerate(zip(actual, expected)):
+            assert_tensors_close(a, e, atol=atol, rtol=rtol, msg=f"{msg} [element {i}]")
+        return
+
     if actual.shape != expected.shape:
         raise AssertionError(f"{msg} Shape mismatch: actual {actual.shape} vs expected {expected.shape}")
 

From d241cdc63ce6f66ccabb8b904ebd2652817a9e51 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 17 Jun 2026 07:47:21 +0000
Subject: [PATCH 4/7] Revert "fix extracter."

This reverts commit 178c4cb99be7ca9f28aeee556e37e9436c5e6f66.
---
 utils/extract_tests_from_mixin.py | 40 ++++++++++---------------------
 1 file changed, 13 insertions(+), 27 deletions(-)

diff --git a/utils/extract_tests_from_mixin.py b/utils/extract_tests_from_mixin.py
index 04b157ff502c..c8b65b96ee16 100644
--- a/utils/extract_tests_from_mixin.py
+++ b/utils/extract_tests_from_mixin.py
@@ -30,46 +30,32 @@ def generate_pytest_pattern(test_methods: List[str]) -> str:
     return " or ".join(test_methods)
 
 
-def generate_pattern_for_mixins(mixin_classes: List[Type]) -> str:
+def generate_pattern_for_mixin(mixin_class: Type) -> str:
     """
-    Generate a pytest pattern covering the test methods of all the given mixin classes.
+    Generate pytest pattern for a specific mixin class.
     """
-    test_methods = set()
-    for mixin_class in mixin_classes:
-        test_methods.update(get_test_methods_from_class(mixin_class))
-    return generate_pytest_pattern(sorted(test_methods))
+    if mixin_cls is None:
+        return ""
+    test_methods = get_test_methods_from_class(mixin_class)
+    return generate_pytest_pattern(test_methods)
 
 
 if __name__ == "__main__":
-    mixin_classes = []
+    mixin_cls = None
     if args.type == "pipeline":
         from tests.pipelines.test_pipelines_common import PipelineTesterMixin
 
-        mixin_classes = [PipelineTesterMixin]
+        mixin_cls = PipelineTesterMixin
 
     elif args.type == "models":
-        # The model tester suite is split across several mixins under `tests/models/testing_utils`,
-        # so aggregate their test methods to reconstruct the full coverage.
-        from tests.models.testing_utils import (
-            AttentionTesterMixin,
-            LoraTesterMixin,
-            MemoryTesterMixin,
-            ModelTesterMixin,
-            TrainingTesterMixin,
-        )
-
-        mixin_classes = [
-            ModelTesterMixin,
-            MemoryTesterMixin,
-            TrainingTesterMixin,
-            AttentionTesterMixin,
-            LoraTesterMixin,
-        ]
+        from tests.models.test_modeling_common import ModelTesterMixin
+
+        mixin_cls = ModelTesterMixin
 
     elif args.type == "lora":
         from tests.lora.utils import PeftLoraLoaderMixinTests
 
-        mixin_classes = [PeftLoraLoaderMixinTests]
+        mixin_cls = PeftLoraLoaderMixinTests
 
-    pattern = generate_pattern_for_mixins(mixin_classes)
+    pattern = generate_pattern_for_mixin(mixin_cls)
     print(pattern)

From d40db9d491f0632c5eff0652c6d07629d6bfd94c Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 17 Jun 2026 07:50:02 +0000
Subject: [PATCH 5/7] Revert "port final set of model tests and others"

This reverts commit a92c70c08116511e0977c0e71595219d32834f02.
---
 tests/models/test_modeling_common.py          | 2172 ++++++++++++++++-
 .../test_models_dit_transformer2d.py          |  109 +-
 .../test_models_pixart_transformer2d.py       |  116 +-
 .../models/transformers/test_models_prior.py  |  105 +-
 .../test_models_transformer_allegro.py        |   87 +-
 .../test_models_transformer_aura_flow.py      |   89 +-
 .../test_models_transformer_cogvideox.py      |  161 +-
 .../test_models_transformer_cogview3plus.py   |  108 +-
 .../test_models_transformer_cogview4.py       |   95 +-
 .../test_models_transformer_consisid.py       |   88 +-
 .../test_models_transformer_latte.py          |   97 +-
 .../test_models_transformer_motif_video.py    |    2 +-
 .../test_models_transformer_sana_video.py     |  103 +-
 .../test_models_transformer_temporal.py       |   76 +-
 tests/others/test_utils.py                    |    2 +-
 15 files changed, 2630 insertions(+), 780 deletions(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 7e7822ac16ea..8575439649d7 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -13,46 +13,225 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
+import gc
+import glob
 import inspect
-import logging
+import json
 import os
+import re
 import tempfile
+import traceback
+import unittest
 import unittest.mock as mock
 import uuid
+from collections import defaultdict
+from typing import Dict, List, Tuple
 
+import numpy as np
 import pytest
 import requests_mock
+import safetensors.torch
 import torch
+import torch.nn as nn
+from accelerate.utils.modeling import _get_proper_dtype, compute_module_sizes, dtype_byte_size
 from huggingface_hub import ModelCard, delete_repo, snapshot_download, try_to_load_from_cache
 from huggingface_hub.utils import HfHubHTTPError, is_jinja_available
+from parameterized import parameterized
 
 from diffusers.models import FluxTransformer2DModel, SD3Transformer2DModel, UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    AttnProcessor,
+    AttnProcessor2_0,
+    AttnProcessorNPU,
+    XFormersAttnProcessor,
+)
+from diffusers.models.auto_model import AutoModel
+from diffusers.models.modeling_outputs import BaseOutput
+from diffusers.training_utils import EMAModel
+from diffusers.utils import (
+    SAFE_WEIGHTS_INDEX_NAME,
+    WEIGHTS_INDEX_NAME,
+    is_peft_available,
+    is_torch_npu_available,
+    is_xformers_available,
+    logging,
+)
+from diffusers.utils.hub_utils import _add_variant
+from diffusers.utils.torch_utils import get_torch_cuda_device_capability
 
 from ..others.test_utils import TOKEN, USER, is_staging_test
 from ..testing_utils import (
     CaptureLogger,
+    _check_safetensors_serialization,
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_peak_memory_stats,
+    backend_synchronize,
+    check_if_dicts_are_equal,
+    get_python_version,
+    is_torch_compile,
+    numpy_cosine_similarity_distance,
+    require_peft_backend,
+    require_peft_version_greater,
+    require_torch_2,
     require_torch_accelerator,
+    require_torch_accelerator_with_training,
+    require_torch_multi_accelerator,
+    require_torch_version_greater,
+    run_test_in_subprocess,
+    slow,
+    torch_all_close,
     torch_device,
 )
 
 
-class TestModelUtils:
+if is_peft_available():
+    from peft.tuners.tuners_utils import BaseTunerLayer
+
+
+def caculate_expected_num_shards(index_map_path):
+    with open(index_map_path) as f:
+        weight_map_dict = json.load(f)["weight_map"]
+    first_key = list(weight_map_dict.keys())[0]
+    weight_loc = weight_map_dict[first_key]  # e.g., diffusion_pytorch_model-00001-of-00002.safetensors
+    expected_num_shards = int(weight_loc.split("-")[-1].split(".")[0])
+    return expected_num_shards
+
+
+def check_if_lora_correctly_set(model) -> bool:
+    """
+    Checks if the LoRA layers are correctly set with peft
+    """
+    for module in model.modules():
+        if isinstance(module, BaseTunerLayer):
+            return True
+    return False
+
+
+def normalize_output(out):
+    out0 = out[0] if isinstance(out, (BaseOutput, tuple)) else out
+    return torch.stack(out0) if isinstance(out0, list) else out0
+
+
+# Will be run via run_test_in_subprocess
+def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout):
+    error = None
+    try:
+        init_dict, model_class = in_queue.get(timeout=timeout)
+
+        model = model_class(**init_dict)
+        model.to(torch_device)
+        model = torch.compile(model)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, safe_serialization=False)
+            new_model = model_class.from_pretrained(tmpdirname)
+            new_model.to(torch_device)
+
+        assert new_model.__class__ == model_class
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+def named_persistent_module_tensors(
+    module: nn.Module,
+    recurse: bool = False,
+):
+    """
+    A helper function that gathers all the tensors (parameters + persistent buffers) of a given module.
+
+    Args:
+        module (`torch.nn.Module`):
+            The module we want the tensors on.
+        recurse (`bool`, *optional`, defaults to `False`):
+            Whether or not to go look in every submodule or just return the direct parameters and buffers.
+    """
+    yield from module.named_parameters(recurse=recurse)
+
+    for named_buffer in module.named_buffers(recurse=recurse):
+        name, _ = named_buffer
+        # Get parent by splitting on dots and traversing the model
+        parent = module
+        if "." in name:
+            parent_name = name.rsplit(".", 1)[0]
+            for part in parent_name.split("."):
+                parent = getattr(parent, part)
+            name = name.split(".")[-1]
+        if name not in parent._non_persistent_buffers_set:
+            yield named_buffer
+
+
+def compute_module_persistent_sizes(
+    model: nn.Module,
+    dtype: str | torch.device | None = None,
+    special_dtypes: dict[str, str | torch.device] | None = None,
+):
+    """
+    Compute the size of each submodule of a given model (parameters + persistent buffers).
+    """
+    if dtype is not None:
+        dtype = _get_proper_dtype(dtype)
+        dtype_size = dtype_byte_size(dtype)
+    if special_dtypes is not None:
+        special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()}
+        special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()}
+    module_sizes = defaultdict(int)
+
+    module_list = []
+
+    module_list = named_persistent_module_tensors(model, recurse=True)
+
+    for name, tensor in module_list:
+        if special_dtypes is not None and name in special_dtypes:
+            size = tensor.numel() * special_dtypes_size[name]
+        elif dtype is None:
+            size = tensor.numel() * dtype_byte_size(tensor.dtype)
+        elif str(tensor.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
+            # According to the code in set_module_tensor_to_device, these types won't be converted
+            # so use their original size here
+            size = tensor.numel() * dtype_byte_size(tensor.dtype)
+        else:
+            size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype))
+        name_parts = name.split(".")
+        for idx in range(len(name_parts) + 1):
+            module_sizes[".".join(name_parts[:idx])] += size
+
+    return module_sizes
+
+
+def cast_maybe_tensor_dtype(maybe_tensor, current_dtype, target_dtype):
+    if torch.is_tensor(maybe_tensor):
+        return maybe_tensor.to(target_dtype) if maybe_tensor.dtype == current_dtype else maybe_tensor
+    if isinstance(maybe_tensor, dict):
+        return {k: cast_maybe_tensor_dtype(v, current_dtype, target_dtype) for k, v in maybe_tensor.items()}
+    if isinstance(maybe_tensor, list):
+        return [cast_maybe_tensor_dtype(v, current_dtype, target_dtype) for v in maybe_tensor]
+    return maybe_tensor
+
+
+class ModelUtilsTest(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+
     def test_missing_key_loading_warning_message(self):
-        logger = logging.getLogger("diffusers.models.modeling_utils")
-        with CaptureLogger(logger) as cap_logger:
+        with self.assertLogs("diffusers.models.modeling_utils", level="WARNING") as logs:
             UNet2DConditionModel.from_pretrained("hf-internal-testing/stable-diffusion-broken", subfolder="unet")
 
         # make sure that error message states what keys are missing
-        assert "conv_out.bias" in cap_logger.out
+        assert "conv_out.bias" in " ".join(logs.output)
 
-    @pytest.mark.parametrize(
-        "repo_id, subfolder, use_local",
+    @parameterized.expand(
         [
             ("hf-internal-testing/tiny-stable-diffusion-pipe-variants-all-kinds", "unet", False),
             ("hf-internal-testing/tiny-stable-diffusion-pipe-variants-all-kinds", "unet", True),
             ("hf-internal-testing/tiny-sd-unet-with-sharded-ckpt", None, False),
             ("hf-internal-testing/tiny-sd-unet-with-sharded-ckpt", None, True),
-        ],
+        ]
     )
     def test_variant_sharded_ckpt_legacy_format_raises_warning(self, repo_id, subfolder, use_local):
         def load_model(path):
@@ -61,7 +240,7 @@ def load_model(path):
                 kwargs["subfolder"] = subfolder
             return UNet2DConditionModel.from_pretrained(path, **kwargs)
 
-        with pytest.warns(FutureWarning) as warning:
+        with self.assertWarns(FutureWarning) as warning:
             if use_local:
                 with tempfile.TemporaryDirectory() as tmpdirname:
                     tmpdirname = snapshot_download(repo_id=repo_id)
@@ -69,20 +248,19 @@ def load_model(path):
             else:
                 _ = load_model(repo_id)
 
-        warning_messages = " ".join(str(w.message) for w in warning)
-        assert "This serialization format is now deprecated to standardize the serialization" in warning_messages
+        warning_messages = " ".join(str(w.message) for w in warning.warnings)
+        self.assertIn("This serialization format is now deprecated to standardize the serialization", warning_messages)
 
     # Local tests are already covered down below.
-    @pytest.mark.parametrize(
-        "repo_id, subfolder, variant",
+    @parameterized.expand(
         [
             ("hf-internal-testing/tiny-sd-unet-sharded-latest-format", None, "fp16"),
             ("hf-internal-testing/tiny-sd-unet-sharded-latest-format-subfolder", "unet", "fp16"),
             ("hf-internal-testing/tiny-sd-unet-sharded-no-variants", None, None),
             ("hf-internal-testing/tiny-sd-unet-sharded-no-variants-subfolder", "unet", None),
-        ],
+        ]
     )
-    def test_variant_sharded_ckpt_loads_from_hub(self, repo_id, subfolder, variant):
+    def test_variant_sharded_ckpt_loads_from_hub(self, repo_id, subfolder, variant=None):
         def load_model():
             kwargs = {}
             if variant:
@@ -134,7 +312,7 @@ def test_local_files_only_with_sharded_checkpoint(self):
             with mock.patch("huggingface_hub.hf_api.get_session", return_value=client_mock):
                 # Should fail with local_files_only=False (network required)
                 # We would make a network call with model_info
-                with pytest.raises(OSError):
+                with self.assertRaises(OSError):
                     FluxTransformer2DModel.from_pretrained(
                         repo_id, subfolder="transformer", cache_dir=tmpdir, local_files_only=False
                     )
@@ -156,19 +334,19 @@ def test_local_files_only_with_sharded_checkpoint(self):
             os.remove(cached_shard_file)
 
             # Attempting to load from cache should raise an error
-            with pytest.raises(OSError) as context:
+            with self.assertRaises(OSError) as context:
                 FluxTransformer2DModel.from_pretrained(
                     repo_id, subfolder="transformer", cache_dir=tmpdir, local_files_only=True
                 )
 
             # Verify error mentions the missing shard
-            error_msg = str(context.value)
+            error_msg = str(context.exception)
             assert cached_shard_file in error_msg or "required according to the checkpoint index" in error_msg, (
                 f"Expected error about missing shard, got: {error_msg}"
             )
 
-    @pytest.mark.skip(reason="Flaky behaviour on CI. Re-enable after migrating to new runners")
-    @pytest.mark.skipif(torch_device == "mps", reason="Test not supported for MPS.")
+    @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
+    @unittest.skipIf(torch_device == "mps", reason="Test not supported for MPS.")
     def test_one_request_upon_cached(self):
         use_safetensors = False
 
@@ -201,7 +379,7 @@ def test_one_request_upon_cached(self):
             )
 
     def test_weight_overwrite(self):
-        with tempfile.TemporaryDirectory() as tmpdirname, pytest.raises(ValueError) as error_context:
+        with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(ValueError) as error_context:
             UNet2DConditionModel.from_pretrained(
                 "hf-internal-testing/tiny-stable-diffusion-torch",
                 subfolder="unet",
@@ -210,7 +388,7 @@ def test_weight_overwrite(self):
             )
 
         # make sure that error message states what keys are missing
-        assert "Cannot load" in str(error_context.value)
+        assert "Cannot load" in str(error_context.exception)
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             model = UNet2DConditionModel.from_pretrained(
@@ -242,9 +420,9 @@ def test_keep_modules_in_fp32(self):
             for name, module in model.named_modules():
                 if isinstance(module, torch.nn.Linear):
                     if name in model._keep_in_fp32_modules:
-                        assert module.weight.dtype == torch.float32
+                        self.assertTrue(module.weight.dtype == torch.float32)
                     else:
-                        assert module.weight.dtype == torch_dtype
+                        self.assertTrue(module.weight.dtype == torch_dtype)
 
         def get_dummy_inputs():
             batch_size = 2
@@ -308,8 +486,1542 @@ def test_forward_with_norm_groups(self):
         assert output.shape == expected_shape, "Input and output shapes do not match"
 
 
+class ModelTesterMixin:
+    main_input_name = None  # overwrite in model specific tester class
+    base_precision = 1e-3
+    forward_requires_fresh_args = False
+    model_split_percents = [0.5, 0.7, 0.9]
+    uses_custom_attn_processor = False
+
+    def check_device_map_is_respected(self, model, device_map):
+        for param_name, param in model.named_parameters():
+            # Find device in device_map
+            while len(param_name) > 0 and param_name not in device_map:
+                param_name = ".".join(param_name.split(".")[:-1])
+            if param_name not in device_map:
+                raise ValueError("device map is incomplete, it does not contain any device for `param_name`.")
+
+            param_device = device_map[param_name]
+            if param_device in ["cpu", "disk"]:
+                self.assertEqual(param.device, torch.device("meta"))
+            else:
+                self.assertEqual(param.device, torch.device(param_device))
+
+    def test_from_save_pretrained(self, expected_max_diff=5e-5):
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+
+        if hasattr(model, "set_default_attn_processor"):
+            model.set_default_attn_processor()
+        model.to(torch_device)
+        model.eval()
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, safe_serialization=False)
+            new_model = self.model_class.from_pretrained(tmpdirname)
+            if hasattr(new_model, "set_default_attn_processor"):
+                new_model.set_default_attn_processor()
+            new_model.to(torch_device)
+
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                image = model(**self.inputs_dict(0))
+            else:
+                image = model(**inputs_dict)
+
+            if isinstance(image, dict):
+                image = image.to_tuple()[0]
+
+            if self.forward_requires_fresh_args:
+                new_image = new_model(**self.inputs_dict(0))
+            else:
+                new_image = new_model(**inputs_dict)
+
+            if isinstance(new_image, dict):
+                new_image = new_image.to_tuple()[0]
+
+            image = normalize_output(image)
+            new_image = normalize_output(new_image)
+
+        max_diff = (image - new_image).abs().max().item()
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+
+    def test_getattr_is_correct(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+
+        # save some things to test
+        model.dummy_attribute = 5
+        model.register_to_config(test_attribute=5)
+
+        logger = logging.get_logger("diffusers.models.modeling_utils")
+        # 30 for warning
+        logger.setLevel(30)
+        with CaptureLogger(logger) as cap_logger:
+            assert hasattr(model, "dummy_attribute")
+            assert getattr(model, "dummy_attribute") == 5
+            assert model.dummy_attribute == 5
+
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+        logger = logging.get_logger("diffusers.models.modeling_utils")
+        # 30 for warning
+        logger.setLevel(30)
+        with CaptureLogger(logger) as cap_logger:
+            assert hasattr(model, "save_pretrained")
+            fn = model.save_pretrained
+            fn_1 = getattr(model, "save_pretrained")
+
+            assert fn == fn_1
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+        # warning should be thrown
+        with self.assertWarns(FutureWarning):
+            assert model.test_attribute == 5
+
+        with self.assertWarns(FutureWarning):
+            assert getattr(model, "test_attribute") == 5
+
+        with self.assertRaises(AttributeError) as error:
+            model.does_not_exist
+
+        assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'"
+
+    @unittest.skipIf(
+        torch_device != "npu" or not is_torch_npu_available(),
+        reason="torch npu flash attention is only available with NPU and `torch_npu` installed",
+    )
+    def test_set_torch_npu_flash_attn_processor_determinism(self):
+        torch.use_deterministic_algorithms(False)
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        if not hasattr(model, "set_attn_processor"):
+            # If not has `set_attn_processor`, skip test
+            return
+
+        model.set_default_attn_processor()
+        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output = model(**self.inputs_dict(0))[0]
+            else:
+                output = model(**inputs_dict)[0]
+
+        model.enable_npu_flash_attention()
+        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_2 = model(**self.inputs_dict(0))[0]
+            else:
+                output_2 = model(**inputs_dict)[0]
+
+        model.set_attn_processor(AttnProcessorNPU())
+        assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_3 = model(**self.inputs_dict(0))[0]
+            else:
+                output_3 = model(**inputs_dict)[0]
+
+        torch.use_deterministic_algorithms(True)
+
+        assert torch.allclose(output, output_2, atol=self.base_precision)
+        assert torch.allclose(output, output_3, atol=self.base_precision)
+        assert torch.allclose(output_2, output_3, atol=self.base_precision)
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_set_xformers_attn_processor_for_determinism(self):
+        torch.use_deterministic_algorithms(False)
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        if not hasattr(model, "set_attn_processor"):
+            # If not has `set_attn_processor`, skip test
+            return
+
+        if not hasattr(model, "set_default_attn_processor"):
+            # If not has `set_attn_processor`, skip test
+            return
+
+        model.set_default_attn_processor()
+        assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output = model(**self.inputs_dict(0))[0]
+            else:
+                output = model(**inputs_dict)[0]
+
+        model.enable_xformers_memory_efficient_attention()
+        assert all(type(proc) == XFormersAttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_2 = model(**self.inputs_dict(0))[0]
+            else:
+                output_2 = model(**inputs_dict)[0]
+
+        model.set_attn_processor(XFormersAttnProcessor())
+        assert all(type(proc) == XFormersAttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_3 = model(**self.inputs_dict(0))[0]
+            else:
+                output_3 = model(**inputs_dict)[0]
+
+        torch.use_deterministic_algorithms(True)
+
+        assert torch.allclose(output, output_2, atol=self.base_precision)
+        assert torch.allclose(output, output_3, atol=self.base_precision)
+        assert torch.allclose(output_2, output_3, atol=self.base_precision)
+
+    @require_torch_accelerator
+    def test_set_attn_processor_for_determinism(self):
+        if self.uses_custom_attn_processor:
+            return
+
+        torch.use_deterministic_algorithms(False)
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+
+        model.to(torch_device)
+
+        if not hasattr(model, "set_attn_processor"):
+            # If not has `set_attn_processor`, skip test
+            return
+
+        assert all(type(proc) == AttnProcessor2_0 for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_1 = model(**self.inputs_dict(0))[0]
+            else:
+                output_1 = model(**inputs_dict)[0]
+
+        model.set_default_attn_processor()
+        assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_2 = model(**self.inputs_dict(0))[0]
+            else:
+                output_2 = model(**inputs_dict)[0]
+
+        model.set_attn_processor(AttnProcessor2_0())
+        assert all(type(proc) == AttnProcessor2_0 for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_4 = model(**self.inputs_dict(0))[0]
+            else:
+                output_4 = model(**inputs_dict)[0]
+
+        model.set_attn_processor(AttnProcessor())
+        assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values())
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_5 = model(**self.inputs_dict(0))[0]
+            else:
+                output_5 = model(**inputs_dict)[0]
+
+        torch.use_deterministic_algorithms(True)
+
+        # make sure that outputs match
+        assert torch.allclose(output_2, output_1, atol=self.base_precision)
+        assert torch.allclose(output_2, output_4, atol=self.base_precision)
+        assert torch.allclose(output_2, output_5, atol=self.base_precision)
+
+    def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+
+        if hasattr(model, "set_default_attn_processor"):
+            model.set_default_attn_processor()
+
+        model.to(torch_device)
+        model.eval()
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, variant="fp16", safe_serialization=False)
+            new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")
+            if hasattr(new_model, "set_default_attn_processor"):
+                new_model.set_default_attn_processor()
+
+            # non-variant cannot be loaded
+            with self.assertRaises(OSError) as error_context:
+                self.model_class.from_pretrained(tmpdirname)
+
+            # make sure that error message states what keys are missing
+            assert "Error no file named diffusion_pytorch_model.bin found in directory" in str(error_context.exception)
+
+            new_model.to(torch_device)
+
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                image = model(**self.inputs_dict(0))
+            else:
+                image = model(**inputs_dict)
+            if isinstance(image, dict):
+                image = image.to_tuple()[0]
+
+            if self.forward_requires_fresh_args:
+                new_image = new_model(**self.inputs_dict(0))
+            else:
+                new_image = new_model(**inputs_dict)
+
+            if isinstance(new_image, dict):
+                new_image = new_image.to_tuple()[0]
+
+            image = normalize_output(image)
+            new_image = normalize_output(new_image)
+
+        max_diff = (image - new_image).abs().max().item()
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+
+    @is_torch_compile
+    @require_torch_2
+    @unittest.skipIf(
+        get_python_version == (3, 12),
+        reason="Torch Dynamo isn't yet supported for Python 3.12.",
+    )
+    def test_from_save_pretrained_dynamo(self):
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        inputs = [init_dict, self.model_class]
+        run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=inputs)
+
+    def test_from_save_pretrained_dtype(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            if torch_device == "mps" and dtype == torch.bfloat16:
+                continue
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.to(dtype)
+                model.save_pretrained(tmpdirname, safe_serialization=False)
+                new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=True, torch_dtype=dtype)
+                assert new_model.dtype == dtype
+                if (
+                    hasattr(self.model_class, "_keep_in_fp32_modules")
+                    and self.model_class._keep_in_fp32_modules is None
+                ):
+                    new_model = self.model_class.from_pretrained(
+                        tmpdirname, low_cpu_mem_usage=False, torch_dtype=dtype
+                    )
+                    assert new_model.dtype == dtype
+
+    def test_determinism(self, expected_max_diff=1e-5):
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                first = model(**self.inputs_dict(0))
+            else:
+                first = model(**inputs_dict)
+            if isinstance(first, dict):
+                first = first.to_tuple()[0]
+
+            if self.forward_requires_fresh_args:
+                second = model(**self.inputs_dict(0))
+            else:
+                second = model(**inputs_dict)
+            if isinstance(second, dict):
+                second = second.to_tuple()[0]
+
+            first = normalize_output(first)
+            second = normalize_output(second)
+
+        out_1 = first.cpu().numpy()
+        out_2 = second.cpu().numpy()
+        out_1 = out_1[~np.isnan(out_1)]
+        out_2 = out_2[~np.isnan(out_2)]
+        max_diff = np.amax(np.abs(out_1 - out_2))
+        self.assertLessEqual(max_diff, expected_max_diff)
+
+    def test_output(self, expected_output_shape=None):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.to_tuple()[0]
+            if isinstance(output, list):
+                output = torch.stack(output)
+
+        self.assertIsNotNone(output)
+
+        # input & output have to have the same shape
+        input_tensor = inputs_dict[self.main_input_name]
+        if isinstance(input_tensor, list):
+            input_tensor = torch.stack(input_tensor)
+
+        if expected_output_shape is None:
+            expected_shape = input_tensor.shape
+            self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+        else:
+            self.assertEqual(output.shape, expected_output_shape, "Input and output shapes do not match")
+
+    def test_model_from_pretrained(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        # test if the model can be loaded from the config
+        # and has all the expected shape
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, safe_serialization=False)
+            new_model = self.model_class.from_pretrained(tmpdirname)
+            new_model.to(torch_device)
+            new_model.eval()
+
+        # check if all parameters shape are the same
+        for param_name in model.state_dict().keys():
+            param_1 = model.state_dict()[param_name]
+            param_2 = new_model.state_dict()[param_name]
+            self.assertEqual(param_1.shape, param_2.shape)
+
+        with torch.no_grad():
+            output_1 = model(**inputs_dict)
+
+            if isinstance(output_1, dict):
+                output_1 = output_1.to_tuple()[0]
+            if isinstance(output_1, list):
+                output_1 = torch.stack(output_1)
+
+            output_2 = new_model(**inputs_dict)
+
+            if isinstance(output_2, dict):
+                output_2 = output_2.to_tuple()[0]
+            if isinstance(output_2, list):
+                output_2 = torch.stack(output_2)
+
+        self.assertEqual(output_1.shape, output_2.shape)
+
+    @require_torch_accelerator_with_training
+    def test_training(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.train()
+        output = model(**inputs_dict)
+
+        if isinstance(output, dict):
+            output = output.to_tuple()[0]
+
+        input_tensor = inputs_dict[self.main_input_name]
+        noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device)
+        loss = torch.nn.functional.mse_loss(output, noise)
+        loss.backward()
+
+    @require_torch_accelerator_with_training
+    def test_ema_training(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.train()
+        ema_model = EMAModel(model.parameters())
+
+        output = model(**inputs_dict)
+
+        if isinstance(output, dict):
+            output = output.to_tuple()[0]
+
+        input_tensor = inputs_dict[self.main_input_name]
+        noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device)
+        loss = torch.nn.functional.mse_loss(output, noise)
+        loss.backward()
+        ema_model.step(model.parameters())
+
+    def test_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            # Temporary fallback until `aten::_index_put_impl_` is implemented in mps
+            # Track progress in https://github.com/pytorch/pytorch/issues/77764
+            device = t.device
+            if device.type == "mps":
+                t = t.to("cpu")
+            t[t != t] = 0
+            return t.to(device)
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    torch.allclose(
+                        set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
+                    ),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
+                    ),
+                )
+
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                outputs_dict = model(**self.inputs_dict(0))
+                outputs_tuple = model(**self.inputs_dict(0), return_dict=False)
+            else:
+                outputs_dict = model(**inputs_dict)
+                outputs_tuple = model(**inputs_dict, return_dict=False)
+
+        recursive_check(outputs_tuple, outputs_dict)
+
+    @require_torch_accelerator_with_training
+    def test_enable_disable_gradient_checkpointing(self):
+        # Skip test if model does not support gradient checkpointing
+        if not self.model_class._supports_gradient_checkpointing:
+            pytest.skip("Gradient checkpointing is not supported.")
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+
+        # at init model should have gradient checkpointing disabled
+        model = self.model_class(**init_dict)
+        self.assertFalse(model.is_gradient_checkpointing)
+
+        # check enable works
+        model.enable_gradient_checkpointing()
+        self.assertTrue(model.is_gradient_checkpointing)
+
+        # check disable works
+        model.disable_gradient_checkpointing()
+        self.assertFalse(model.is_gradient_checkpointing)
+
+    @require_torch_accelerator_with_training
+    def test_effective_gradient_checkpointing(self, loss_tolerance=1e-5, param_grad_tol=5e-5, skip: set[str] = {}):
+        # Skip test if model does not support gradient checkpointing
+        if not self.model_class._supports_gradient_checkpointing:
+            pytest.skip("Gradient checkpointing is not supported.")
+
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        inputs_dict_copy = copy.deepcopy(inputs_dict)
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        assert not model.is_gradient_checkpointing and model.training
+
+        out = model(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model.zero_grad()
+
+        labels = torch.randn_like(out)
+        loss = (out - labels).mean()
+        loss.backward()
+
+        # re-instantiate the model now enabling gradient checkpointing
+        torch.manual_seed(0)
+        model_2 = self.model_class(**init_dict)
+        # clone model
+        model_2.load_state_dict(model.state_dict())
+        model_2.to(torch_device)
+        model_2.enable_gradient_checkpointing()
+
+        assert model_2.is_gradient_checkpointing and model_2.training
+
+        out_2 = model_2(**inputs_dict_copy).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model_2.zero_grad()
+        loss_2 = (out_2 - labels).mean()
+        loss_2.backward()
+
+        # compare the output and parameters gradients
+        self.assertTrue((loss - loss_2).abs() < loss_tolerance)
+        named_params = dict(model.named_parameters())
+        named_params_2 = dict(model_2.named_parameters())
+
+        for name, param in named_params.items():
+            if "post_quant_conv" in name:
+                continue
+            if name in skip:
+                continue
+            # TODO(aryan): remove the below lines after looking into easyanimate transformer a little more
+            # It currently errors out the gradient checkpointing test because the gradients for attn2.to_out is None
+            if param.grad is None:
+                continue
+            self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=param_grad_tol))
+
+    @unittest.skipIf(torch_device == "mps", "This test is not supported for MPS devices.")
+    def test_gradient_checkpointing_is_applied(
+        self, expected_set=None, attention_head_dim=None, num_attention_heads=None, block_out_channels=None
+    ):
+        # Skip test if model does not support gradient checkpointing
+        if not self.model_class._supports_gradient_checkpointing:
+            pytest.skip("Gradient checkpointing is not supported.")
+
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        if attention_head_dim is not None:
+            init_dict["attention_head_dim"] = attention_head_dim
+        if num_attention_heads is not None:
+            init_dict["num_attention_heads"] = num_attention_heads
+        if block_out_channels is not None:
+            init_dict["block_out_channels"] = block_out_channels
+
+        model_class_copy = copy.copy(self.model_class)
+        model = model_class_copy(**init_dict)
+        model.enable_gradient_checkpointing()
+
+        modules_with_gc_enabled = {}
+        for submodule in model.modules():
+            if hasattr(submodule, "gradient_checkpointing"):
+                self.assertTrue(submodule.gradient_checkpointing)
+                modules_with_gc_enabled[submodule.__class__.__name__] = True
+
+        assert set(modules_with_gc_enabled.keys()) == expected_set
+        assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
+
+    def test_deprecated_kwargs(self):
+        has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters
+        has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0
+
+        if has_kwarg_in_model_class and not has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are"
+                " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
+                " [<deprecated_argument>]`"
+            )
+
+        if not has_kwarg_in_model_class and has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs` argument to"
+                f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument"
+                " from `_deprecated_kwargs = [<deprecated_argument>]`"
+            )
+
+    @parameterized.expand([(4, 4, True), (4, 8, False), (8, 4, False)])
+    @torch.no_grad()
+    @unittest.skipIf(not is_peft_available(), "Only with PEFT")
+    def test_save_load_lora_adapter(self, rank, lora_alpha, use_dora=False):
+        from peft import LoraConfig
+        from peft.utils import get_peft_model_state_dict
+
+        from diffusers.loaders.peft import PeftAdapterMixin
+
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+
+        if not issubclass(model.__class__, PeftAdapterMixin):
+            pytest.skip(f"PEFT is not supported for this model ({model.__class__.__name__}).")
+
+        torch.manual_seed(0)
+        output_no_lora = model(**inputs_dict, return_dict=False)[0]
+        if isinstance(output_no_lora, list):
+            output_no_lora = torch.stack(output_no_lora)
+
+        denoiser_lora_config = LoraConfig(
+            r=rank,
+            lora_alpha=lora_alpha,
+            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
+            init_lora_weights=False,
+            use_dora=use_dora,
+        )
+        model.add_adapter(denoiser_lora_config)
+        self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
+
+        torch.manual_seed(0)
+        outputs_with_lora = model(**inputs_dict, return_dict=False)[0]
+        if isinstance(outputs_with_lora, list):
+            outputs_with_lora = torch.stack(outputs_with_lora)
+
+        self.assertFalse(torch.allclose(output_no_lora, outputs_with_lora, atol=1e-4, rtol=1e-4))
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            model.save_lora_adapter(tmpdir)
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            state_dict_loaded = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+
+            model.unload_lora()
+            self.assertFalse(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
+
+            model.load_lora_adapter(tmpdir, prefix=None, use_safetensors=True)
+            state_dict_retrieved = get_peft_model_state_dict(model, adapter_name="default_0")
+
+            for k in state_dict_loaded:
+                loaded_v = state_dict_loaded[k]
+                retrieved_v = state_dict_retrieved[k].to(loaded_v.device)
+                self.assertTrue(torch.allclose(loaded_v, retrieved_v))
+
+            self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
+
+        torch.manual_seed(0)
+        outputs_with_lora_2 = model(**inputs_dict, return_dict=False)[0]
+        if isinstance(outputs_with_lora_2, list):
+            outputs_with_lora_2 = torch.stack(outputs_with_lora_2)
+
+        self.assertFalse(torch.allclose(output_no_lora, outputs_with_lora_2, atol=1e-4, rtol=1e-4))
+        self.assertTrue(torch.allclose(outputs_with_lora, outputs_with_lora_2, atol=1e-4, rtol=1e-4))
+
+    @unittest.skipIf(not is_peft_available(), "Only with PEFT")
+    def test_lora_wrong_adapter_name_raises_error(self):
+        from peft import LoraConfig
+
+        from diffusers.loaders.peft import PeftAdapterMixin
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+
+        if not issubclass(model.__class__, PeftAdapterMixin):
+            pytest.skip(f"PEFT is not supported for this model ({model.__class__.__name__}).")
+
+        denoiser_lora_config = LoraConfig(
+            r=4,
+            lora_alpha=4,
+            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
+            init_lora_weights=False,
+            use_dora=False,
+        )
+        model.add_adapter(denoiser_lora_config)
+        self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            wrong_name = "foo"
+            with self.assertRaises(ValueError) as err_context:
+                model.save_lora_adapter(tmpdir, adapter_name=wrong_name)
+
+            self.assertTrue(f"Adapter name {wrong_name} not found in the model." in str(err_context.exception))
+
+    @parameterized.expand([(4, 4, True), (4, 8, False), (8, 4, False)])
+    @torch.no_grad()
+    @unittest.skipIf(not is_peft_available(), "Only with PEFT")
+    def test_lora_adapter_metadata_is_loaded_correctly(self, rank, lora_alpha, use_dora):
+        from peft import LoraConfig
+
+        from diffusers.loaders.peft import PeftAdapterMixin
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+
+        if not issubclass(model.__class__, PeftAdapterMixin):
+            pytest.skip(f"PEFT is not supported for this model ({model.__class__.__name__}).")
+
+        denoiser_lora_config = LoraConfig(
+            r=rank,
+            lora_alpha=lora_alpha,
+            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
+            init_lora_weights=False,
+            use_dora=use_dora,
+        )
+        model.add_adapter(denoiser_lora_config)
+        metadata = model.peft_config["default"].to_dict()
+        self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            model.save_lora_adapter(tmpdir)
+            model_file = os.path.join(tmpdir, "pytorch_lora_weights.safetensors")
+            self.assertTrue(os.path.isfile(model_file))
+
+            model.unload_lora()
+            self.assertFalse(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
+
+            model.load_lora_adapter(tmpdir, prefix=None, use_safetensors=True)
+            parsed_metadata = model.peft_config["default_0"].to_dict()
+            check_if_dicts_are_equal(metadata, parsed_metadata)
+
+    @torch.no_grad()
+    @unittest.skipIf(not is_peft_available(), "Only with PEFT")
+    def test_lora_adapter_wrong_metadata_raises_error(self):
+        from peft import LoraConfig
+
+        from diffusers.loaders.lora_base import LORA_ADAPTER_METADATA_KEY
+        from diffusers.loaders.peft import PeftAdapterMixin
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+
+        if not issubclass(model.__class__, PeftAdapterMixin):
+            pytest.skip(f"PEFT is not supported for this model ({model.__class__.__name__}).")
+
+        denoiser_lora_config = LoraConfig(
+            r=4,
+            lora_alpha=4,
+            target_modules=["to_q", "to_k", "to_v", "to_out.0"],
+            init_lora_weights=False,
+            use_dora=False,
+        )
+        model.add_adapter(denoiser_lora_config)
+        self.assertTrue(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            model.save_lora_adapter(tmpdir)
+            model_file = os.path.join(tmpdir, "pytorch_lora_weights.safetensors")
+            self.assertTrue(os.path.isfile(model_file))
+
+            # Perturb the metadata in the state dict.
+            loaded_state_dict = safetensors.torch.load_file(model_file)
+            metadata = {"format": "pt"}
+            lora_adapter_metadata = denoiser_lora_config.to_dict()
+            lora_adapter_metadata.update({"foo": 1, "bar": 2})
+            for key, value in lora_adapter_metadata.items():
+                if isinstance(value, set):
+                    lora_adapter_metadata[key] = list(value)
+            metadata[LORA_ADAPTER_METADATA_KEY] = json.dumps(lora_adapter_metadata, indent=2, sort_keys=True)
+            safetensors.torch.save_file(loaded_state_dict, model_file, metadata=metadata)
+
+            model.unload_lora()
+            self.assertFalse(check_if_lora_correctly_set(model), "LoRA layers not set correctly")
+
+            with self.assertRaises(TypeError) as err_context:
+                model.load_lora_adapter(tmpdir, prefix=None, use_safetensors=True)
+            self.assertTrue("`LoraConfig` class could not be instantiated" in str(err_context.exception))
+
+    @require_torch_accelerator
+    def test_cpu_offload(self):
+        if self.model_class._no_split_modules is None:
+            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
+
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
+
+        model_size = compute_module_sizes(model)[""]
+        max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir)
+
+            for max_size in max_gpu_sizes:
+                max_memory = {0: max_size, "cpu": model_size * 2}
+                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+
+                # Making sure part of the model will actually end up offloaded
+                self.assertSetEqual(set(new_model.hf_device_map.values()), {0, "cpu"})
+
+                self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+
+                torch.manual_seed(0)
+                new_output = new_model(**inputs_dict)
+                new_normalized_output = normalize_output(new_output)
+
+                self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
+
+    @require_torch_accelerator
+    def test_disk_offload_without_safetensors(self):
+        if self.model_class._no_split_modules is None:
+            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
+
+        model_size = compute_module_sizes(model)[""]
+        max_size = int(self.model_split_percents[0] * model_size)
+        # Force disk offload by setting very small CPU memory
+        max_memory = {0: max_size, "cpu": int(0.1 * max_size)}
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir, safe_serialization=False)
+            with self.assertRaises(ValueError):
+                # This errors out because it's missing an offload folder
+                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+
+            new_model = self.model_class.from_pretrained(
+                tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir
+            )
+
+            self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+            torch.manual_seed(0)
+            new_output = new_model(**inputs_dict)
+            new_normalized_output = normalize_output(new_output)
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
+
+    @require_torch_accelerator
+    def test_disk_offload_with_safetensors(self):
+        if self.model_class._no_split_modules is None:
+            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
+
+        model_size = compute_module_sizes(model)[""]
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir)
+
+            max_size = int(self.model_split_percents[0] * model_size)
+            max_memory = {0: max_size, "cpu": max_size}
+            new_model = self.model_class.from_pretrained(
+                tmp_dir, device_map="auto", offload_folder=tmp_dir, max_memory=max_memory
+            )
+
+            self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+            torch.manual_seed(0)
+            new_output = new_model(**inputs_dict)
+            new_normalized_output = normalize_output(new_output)
+
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
+
+    @require_torch_multi_accelerator
+    def test_model_parallelism(self):
+        if self.model_class._no_split_modules is None:
+            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+
+        model_size = compute_module_sizes(model)[""]
+        # We test several splits of sizes to make sure it works.
+        max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir)
+
+            for max_size in max_gpu_sizes:
+                max_memory = {0: max_size, 1: model_size * 2, "cpu": model_size * 2}
+                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+                # Making sure part of the model will actually end up offloaded
+                self.assertSetEqual(set(new_model.hf_device_map.values()), {0, 1})
+
+                self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+
+                torch.manual_seed(0)
+                new_output = new_model(**inputs_dict)
+
+                self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
+    @require_torch_accelerator
+    def test_sharded_checkpoints(self):
+        torch.manual_seed(0)
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        model = model.to(torch_device)
+
+        base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
+
+        model_size = compute_module_persistent_sizes(model)[""]
+        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            # Now check if the right number of shards exists. First, let's get the number of shards.
+            # Since this number can be dependent on the model being tested, it's important that we calculate it
+            # instead of hardcoding it.
+            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))
+            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
+            self.assertTrue(actual_num_shards == expected_num_shards)
+
+            new_model = self.model_class.from_pretrained(tmp_dir).eval()
+            new_model = new_model.to(torch_device)
+
+            torch.manual_seed(0)
+            if "generator" in inputs_dict:
+                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            new_output = new_model(**inputs_dict)
+            new_normalized_output = normalize_output(new_output)
+
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
+
+    @require_torch_accelerator
+    def test_sharded_checkpoints_with_variant(self):
+        torch.manual_seed(0)
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        model = model.to(torch_device)
+
+        base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
+
+        model_size = compute_module_persistent_sizes(model)[""]
+        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
+        variant = "fp16"
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # It doesn't matter if the actual model is in fp16 or not. Just adding the variant and
+            # testing if loading works with the variant when the checkpoint is sharded should be
+            # enough.
+            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB", variant=variant)
+
+            index_filename = _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, index_filename)))
+
+            # Now check if the right number of shards exists. First, let's get the number of shards.
+            # Since this number can be dependent on the model being tested, it's important that we calculate it
+            # instead of hardcoding it.
+            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, index_filename))
+            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
+            self.assertTrue(actual_num_shards == expected_num_shards)
+
+            new_model = self.model_class.from_pretrained(tmp_dir, variant=variant).eval()
+            new_model = new_model.to(torch_device)
+
+            torch.manual_seed(0)
+            if "generator" in inputs_dict:
+                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            new_output = new_model(**inputs_dict)
+            new_normalized_output = normalize_output(new_output)
+
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
+
+    @require_torch_accelerator
+    def test_sharded_checkpoints_with_parallel_loading(self):
+        torch.manual_seed(0)
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        model = model.to(torch_device)
+
+        base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
+
+        model_size = compute_module_persistent_sizes(model)[""]
+        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            # Now check if the right number of shards exists. First, let's get the number of shards.
+            # Since this number can be dependent on the model being tested, it's important that we calculate it
+            # instead of hardcoding it.
+            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))
+            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
+            self.assertTrue(actual_num_shards == expected_num_shards)
+
+            # Load with parallel loading
+            os.environ["HF_ENABLE_PARALLEL_LOADING"] = "yes"
+            new_model = self.model_class.from_pretrained(tmp_dir).eval()
+            new_model = new_model.to(torch_device)
+
+            torch.manual_seed(0)
+            if "generator" in inputs_dict:
+                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            new_output = new_model(**inputs_dict)
+            new_normalized_output = normalize_output(new_output)
+
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
+            # set to no.
+            os.environ["HF_ENABLE_PARALLEL_LOADING"] = "no"
+
+    @require_torch_accelerator
+    def test_sharded_checkpoints_device_map(self):
+        if self.model_class._no_split_modules is None:
+            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        model = model.to(torch_device)
+
+        torch.manual_seed(0)
+        base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
+
+        model_size = compute_module_persistent_sizes(model)[""]
+        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            # Now check if the right number of shards exists. First, let's get the number of shards.
+            # Since this number can be dependent on the model being tested, it's important that we calculate it
+            # instead of hardcoding it.
+            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))
+            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
+            self.assertTrue(actual_num_shards == expected_num_shards)
+
+            new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto")
+
+            torch.manual_seed(0)
+            if "generator" in inputs_dict:
+                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            new_output = new_model(**inputs_dict)
+            new_normalized_output = normalize_output(new_output)
+
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
+
+    # This test is okay without a GPU because we're not running any execution. We're just serializing
+    # and check if the resultant files are following an expected format.
+    def test_variant_sharded_ckpt_right_format(self):
+        for use_safe in [True, False]:
+            extension = ".safetensors" if use_safe else ".bin"
+            config, _ = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**config).eval()
+
+            model_size = compute_module_persistent_sizes(model)[""]
+            max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
+            variant = "fp16"
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                model.cpu().save_pretrained(
+                    tmp_dir, variant=variant, max_shard_size=f"{max_shard_size}KB", safe_serialization=use_safe
+                )
+                index_variant = _add_variant(SAFE_WEIGHTS_INDEX_NAME if use_safe else WEIGHTS_INDEX_NAME, variant)
+                self.assertTrue(os.path.exists(os.path.join(tmp_dir, index_variant)))
+
+                # Now check if the right number of shards exists. First, let's get the number of shards.
+                # Since this number can be dependent on the model being tested, it's important that we calculate it
+                # instead of hardcoding it.
+                expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, index_variant))
+                actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(extension)])
+                self.assertTrue(actual_num_shards == expected_num_shards)
+
+                # Check if the variant is present as a substring in the checkpoints.
+                shard_files = [
+                    file
+                    for file in os.listdir(tmp_dir)
+                    if file.endswith(extension) or ("index" in file and "json" in file)
+                ]
+                assert all(variant in f for f in shard_files)
+
+                # Check if the sharded checkpoints were serialized in the right format.
+                shard_files = [file for file in os.listdir(tmp_dir) if file.endswith(extension)]
+                # Example: diffusion_pytorch_model.fp16-00001-of-00002.safetensors
+                assert all(f.split(".")[1].split("-")[0] == variant for f in shard_files)
+
+    def test_layerwise_casting_training(self):
+        def test_fn(storage_dtype, compute_dtype):
+            if torch.device(torch_device).type == "cpu" and compute_dtype == torch.bfloat16:
+                pytest.skip("Skipping test because CPU doesn't go well with bfloat16.")
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+            model = self.model_class(**init_dict)
+            model = model.to(torch_device, dtype=compute_dtype)
+            model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
+            model.train()
+
+            inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
+            with torch.amp.autocast(device_type=torch.device(torch_device).type):
+                output = model(**inputs_dict)
+
+                if isinstance(output, dict):
+                    output = output.to_tuple()[0]
+
+                input_tensor = inputs_dict[self.main_input_name]
+                noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device)
+                noise = cast_maybe_tensor_dtype(noise, torch.float32, compute_dtype)
+                loss = torch.nn.functional.mse_loss(output, noise)
+
+            loss.backward()
+
+        test_fn(torch.float16, torch.float32)
+        test_fn(torch.float8_e4m3fn, torch.float32)
+        test_fn(torch.float8_e5m2, torch.float32)
+        test_fn(torch.float8_e4m3fn, torch.bfloat16)
+
+    @torch.no_grad()
+    def test_layerwise_casting_inference(self):
+        from diffusers.hooks._common import _GO_LC_SUPPORTED_PYTORCH_LAYERS
+        from diffusers.hooks.layerwise_casting import DEFAULT_SKIP_MODULES_PATTERN
+
+        torch.manual_seed(0)
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config)
+        model.eval()
+        model.to(torch_device)
+        base_slice = model(**inputs_dict)[0]
+        base_slice = normalize_output(base_slice)
+        base_slice = base_slice.detach().flatten().cpu().numpy()
+
+        def check_linear_dtype(module, storage_dtype, compute_dtype):
+            patterns_to_check = DEFAULT_SKIP_MODULES_PATTERN
+            if getattr(module, "_skip_layerwise_casting_patterns", None) is not None:
+                patterns_to_check += tuple(module._skip_layerwise_casting_patterns)
+            for name, submodule in module.named_modules():
+                if not isinstance(submodule, _GO_LC_SUPPORTED_PYTORCH_LAYERS):
+                    continue
+                dtype_to_check = storage_dtype
+                if any(re.search(pattern, name) for pattern in patterns_to_check):
+                    dtype_to_check = compute_dtype
+                if getattr(submodule, "weight", None) is not None:
+                    self.assertEqual(submodule.weight.dtype, dtype_to_check)
+                if getattr(submodule, "bias", None) is not None:
+                    self.assertEqual(submodule.bias.dtype, dtype_to_check)
+
+        def test_layerwise_casting(storage_dtype, compute_dtype):
+            torch.manual_seed(0)
+            config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
+            model = self.model_class(**config).eval()
+            model = model.to(torch_device, dtype=compute_dtype)
+            model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
+
+            check_linear_dtype(model, storage_dtype, compute_dtype)
+            output = model(**inputs_dict)[0]
+            output = normalize_output(output)
+            output = output.float().flatten().detach().cpu().numpy()
+
+            # The precision test is not very important for fast tests. In most cases, the outputs will not be the same.
+            # We just want to make sure that the layerwise casting is working as expected.
+            self.assertTrue(numpy_cosine_similarity_distance(base_slice, output) < 1.0)
+
+        test_layerwise_casting(torch.float16, torch.float32)
+        test_layerwise_casting(torch.float8_e4m3fn, torch.float32)
+        test_layerwise_casting(torch.float8_e5m2, torch.float32)
+        test_layerwise_casting(torch.float8_e4m3fn, torch.bfloat16)
+
+    @require_torch_accelerator
+    @torch.no_grad()
+    def test_layerwise_casting_memory(self):
+        MB_TOLERANCE = 0.2
+        LEAST_COMPUTE_CAPABILITY = 8.0
+
+        def reset_memory_stats():
+            gc.collect()
+            backend_synchronize(torch_device)
+            backend_empty_cache(torch_device)
+            backend_reset_peak_memory_stats(torch_device)
+
+        def get_memory_usage(storage_dtype, compute_dtype):
+            torch.manual_seed(0)
+            config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
+            model = self.model_class(**config).eval()
+            model = model.to(torch_device, dtype=compute_dtype)
+            model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
+
+            reset_memory_stats()
+            model(**inputs_dict)
+            model_memory_footprint = model.get_memory_footprint()
+            peak_inference_memory_allocated_mb = backend_max_memory_allocated(torch_device) / 1024**2
+
+            return model_memory_footprint, peak_inference_memory_allocated_mb
+
+        fp32_memory_footprint, fp32_max_memory = get_memory_usage(torch.float32, torch.float32)
+        fp8_e4m3_fp32_memory_footprint, fp8_e4m3_fp32_max_memory = get_memory_usage(torch.float8_e4m3fn, torch.float32)
+        fp8_e4m3_bf16_memory_footprint, fp8_e4m3_bf16_max_memory = get_memory_usage(
+            torch.float8_e4m3fn, torch.bfloat16
+        )
+
+        compute_capability = get_torch_cuda_device_capability() if torch_device == "cuda" else None
+        self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
+        # NOTE: the following assertion would fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
+        # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes. So, we conditionally check it.
+        if compute_capability and compute_capability >= LEAST_COMPUTE_CAPABILITY:
+            self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
+        # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
+        # bytes. This only happens for some models, so we allow a small tolerance.
+        # For any real model being tested, the order would be fp8_e4m3_bf16 < fp8_e4m3_fp32 < fp32.
+        self.assertTrue(
+            fp8_e4m3_fp32_max_memory < fp32_max_memory
+            or abs(fp8_e4m3_fp32_max_memory - fp32_max_memory) < MB_TOLERANCE
+        )
+
+    @parameterized.expand([False, True])
+    @require_torch_accelerator
+    def test_group_offloading(self, record_stream):
+        for cls in inspect.getmro(self.__class__):
+            if "test_group_offloading" in cls.__dict__ and cls is not ModelTesterMixin:
+                # Skip this test if it is overwritten by child class. We need to do this because parameterized
+                # materializes the test methods on invocation which cannot be overridden.
+                pytest.skip("Model does not support group offloading.")
+
+        if not self.model_class._supports_group_offloading:
+            pytest.skip("Model does not support group offloading.")
+
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        torch.manual_seed(0)
+
+        @torch.no_grad()
+        def run_forward(model):
+            self.assertTrue(
+                all(
+                    module._diffusers_hook.get_hook("group_offloading") is not None
+                    for module in model.modules()
+                    if hasattr(module, "_diffusers_hook")
+                )
+            )
+            model.eval()
+            return model(**inputs_dict)[0]
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        output_without_group_offloading = run_forward(model)
+        output_without_group_offloading = normalize_output(output_without_group_offloading)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1)
+        output_with_group_offloading1 = run_forward(model)
+        output_with_group_offloading1 = normalize_output(output_with_group_offloading1)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, non_blocking=True)
+        output_with_group_offloading2 = run_forward(model)
+        output_with_group_offloading2 = normalize_output(output_with_group_offloading2)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.enable_group_offload(torch_device, offload_type="leaf_level")
+        output_with_group_offloading3 = run_forward(model)
+        output_with_group_offloading3 = normalize_output(output_with_group_offloading3)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.enable_group_offload(
+            torch_device, offload_type="leaf_level", use_stream=True, record_stream=record_stream
+        )
+        output_with_group_offloading4 = run_forward(model)
+        output_with_group_offloading4 = normalize_output(output_with_group_offloading4)
+
+        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading1, atol=1e-5))
+        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading2, atol=1e-5))
+        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading3, atol=1e-5))
+        self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading4, atol=1e-5))
+
+    @parameterized.expand([(False, "block_level"), (True, "leaf_level")])
+    @require_torch_accelerator
+    @torch.no_grad()
+    def test_group_offloading_with_layerwise_casting(self, record_stream, offload_type):
+        if not self.model_class._supports_group_offloading:
+            pytest.skip("Model does not support group offloading.")
+
+        torch.manual_seed(0)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+
+        model.to(torch_device)
+        model.eval()
+        _ = model(**inputs_dict)[0]
+
+        torch.manual_seed(0)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        storage_dtype, compute_dtype = torch.float16, torch.float32
+        inputs_dict = cast_maybe_tensor_dtype(inputs_dict, torch.float32, compute_dtype)
+        model = self.model_class(**init_dict)
+        model.eval()
+        additional_kwargs = {} if offload_type == "leaf_level" else {"num_blocks_per_group": 1}
+        model.enable_group_offload(
+            torch_device, offload_type=offload_type, use_stream=True, record_stream=record_stream, **additional_kwargs
+        )
+        model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
+        _ = model(**inputs_dict)[0]
+
+    @parameterized.expand([("block_level", False), ("leaf_level", True)])
+    @require_torch_accelerator
+    @torch.no_grad()
+    @torch.inference_mode()
+    def test_group_offloading_with_disk(self, offload_type, record_stream, atol=1e-5):
+        for cls in inspect.getmro(self.__class__):
+            if "test_group_offloading_with_disk" in cls.__dict__ and cls is not ModelTesterMixin:
+                # Skip this test if it is overwritten by child class. We need to do this because parameterized
+                # materializes the test methods on invocation which cannot be overridden.
+                pytest.skip("Model does not support group offloading with disk yet.")
+
+        if not self.model_class._supports_group_offloading:
+            pytest.skip("Model does not support group offloading.")
+
+        def _has_generator_arg(model):
+            sig = inspect.signature(model.forward)
+            params = sig.parameters
+            return "generator" in params
+
+        def _run_forward(model, inputs_dict):
+            accepts_generator = _has_generator_arg(model)
+            if accepts_generator:
+                inputs_dict["generator"] = torch.manual_seed(0)
+            torch.manual_seed(0)
+            return model(**inputs_dict)[0]
+
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+
+        model.eval()
+        model.to(torch_device)
+        output_without_group_offloading = _run_forward(model, inputs_dict)
+        output_without_group_offloading = normalize_output(output_without_group_offloading)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.eval()
+
+        num_blocks_per_group = None if offload_type == "leaf_level" else 1
+        additional_kwargs = {} if offload_type == "leaf_level" else {"num_blocks_per_group": num_blocks_per_group}
+        with tempfile.TemporaryDirectory() as tmpdir:
+            model.enable_group_offload(
+                torch_device,
+                offload_type=offload_type,
+                offload_to_disk_path=tmpdir,
+                use_stream=True,
+                record_stream=record_stream,
+                **additional_kwargs,
+            )
+            has_safetensors = glob.glob(f"{tmpdir}/*.safetensors")
+            self.assertTrue(has_safetensors, "No safetensors found in the directory.")
+
+            # For "leaf-level", there is a prefetching hook which makes this check a bit non-deterministic
+            # in nature. So, skip it.
+            if offload_type != "leaf_level":
+                is_correct, extra_files, missing_files = _check_safetensors_serialization(
+                    module=model,
+                    offload_to_disk_path=tmpdir,
+                    offload_type=offload_type,
+                    num_blocks_per_group=num_blocks_per_group,
+                    block_modules=model._group_offload_block_modules
+                    if hasattr(model, "_group_offload_block_modules")
+                    else None,
+                )
+                if not is_correct:
+                    if extra_files:
+                        raise ValueError(f"Found extra files: {', '.join(extra_files)}")
+                    elif missing_files:
+                        raise ValueError(f"Following files are missing: {', '.join(missing_files)}")
+
+            output_with_group_offloading = _run_forward(model, inputs_dict)
+            output_with_group_offloading = normalize_output(output_with_group_offloading)
+            self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading, atol=atol))
+
+    def test_auto_model(self, expected_max_diff=5e-5):
+        if self.forward_requires_fresh_args:
+            model = self.model_class(**self.init_dict)
+        else:
+            init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict)
+
+        model = model.eval()
+        model = model.to(torch_device)
+
+        if hasattr(model, "set_default_attn_processor"):
+            model.set_default_attn_processor()
+
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdirname:
+            model.save_pretrained(tmpdirname, safe_serialization=False)
+
+            auto_model = AutoModel.from_pretrained(tmpdirname)
+            if hasattr(auto_model, "set_default_attn_processor"):
+                auto_model.set_default_attn_processor()
+
+        auto_model = auto_model.eval()
+        auto_model = auto_model.to(torch_device)
+
+        with torch.no_grad():
+            if self.forward_requires_fresh_args:
+                output_original = model(**self.inputs_dict(0))
+                output_auto = auto_model(**self.inputs_dict(0))
+            else:
+                output_original = model(**inputs_dict)
+                output_auto = auto_model(**inputs_dict)
+
+        if isinstance(output_original, dict):
+            output_original = output_original.to_tuple()[0]
+        if isinstance(output_auto, dict):
+            output_auto = output_auto.to_tuple()[0]
+
+        if isinstance(output_original, list):
+            output_original = torch.stack(output_original)
+        if isinstance(output_auto, list):
+            output_auto = torch.stack(output_auto)
+
+        output_original, output_auto = output_original.float(), output_auto.float()
+
+        max_diff = (output_original - output_auto).abs().max().item()
+        self.assertLessEqual(
+            max_diff,
+            expected_max_diff,
+            f"AutoModel forward pass diff: {max_diff} exceeds threshold {expected_max_diff}",
+        )
+
+    @parameterized.expand(
+        [
+            (-1, "You can't pass device_map as a negative int"),
+            ("foo", "When passing device_map as a string, the value needs to be a device name"),
+        ]
+    )
+    def test_wrong_device_map_raises_error(self, device_map, msg_substring):
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            model.save_pretrained(tmpdir)
+            with self.assertRaises(ValueError) as err_ctx:
+                _ = self.model_class.from_pretrained(tmpdir, device_map=device_map)
+
+        assert msg_substring in str(err_ctx.exception)
+
+    @parameterized.expand([0, torch_device, torch.device(torch_device)])
+    @require_torch_accelerator
+    def test_passing_non_dict_device_map_works(self, device_map):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).eval()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            model.save_pretrained(tmpdir)
+            loaded_model = self.model_class.from_pretrained(tmpdir, device_map=device_map)
+            _ = loaded_model(**inputs_dict)
+
+    @parameterized.expand([("", torch_device), ("", torch.device(torch_device))])
+    @require_torch_accelerator
+    def test_passing_dict_device_map_works(self, name, device):
+        # There are other valid dict-based `device_map` values too. It's best to refer to
+        # the docs for those: https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap.
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).eval()
+        device_map = {name: device}
+        with tempfile.TemporaryDirectory() as tmpdir:
+            model.save_pretrained(tmpdir)
+            loaded_model = self.model_class.from_pretrained(tmpdir, device_map=device_map)
+            _ = loaded_model(**inputs_dict)
+
+
 @is_staging_test
-class TestModelPushToHub:
+class ModelPushToHubTester(unittest.TestCase):
     identifier = uuid.uuid4()
     repo_id = f"test-model-{identifier}"
     org_repo_id = f"valid_org/{repo_id}-org"
@@ -329,7 +2041,7 @@ def test_push_to_hub(self):
 
         new_model = UNet2DConditionModel.from_pretrained(f"{USER}/{self.repo_id}")
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            assert torch.equal(p1, p2)
+            self.assertTrue(torch.equal(p1, p2))
 
         # Push to hub via save_pretrained to a separate repo. Reusing `self.repo_id` after
         # deleting it makes the staging server's LFS GC reject the next commit with
@@ -340,7 +2052,7 @@ def test_push_to_hub(self):
 
         new_model = UNet2DConditionModel.from_pretrained(f"{USER}/{save_repo_id}")
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            assert torch.equal(p1, p2)
+            self.assertTrue(torch.equal(p1, p2))
 
         # Reset repos
         delete_repo(token=TOKEN, repo_id=self.repo_id)
@@ -361,7 +2073,7 @@ def test_push_to_hub_in_organization(self):
 
         new_model = UNet2DConditionModel.from_pretrained(self.org_repo_id)
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            assert torch.equal(p1, p2)
+            self.assertTrue(torch.equal(p1, p2))
 
         # Push to hub via save_pretrained to a separate repo. Reusing `self.org_repo_id` after
         # deleting it makes the staging server's LFS GC reject the next commit with
@@ -372,13 +2084,13 @@ def test_push_to_hub_in_organization(self):
 
         new_model = UNet2DConditionModel.from_pretrained(save_org_repo_id)
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
-            assert torch.equal(p1, p2)
+            self.assertTrue(torch.equal(p1, p2))
 
         # Reset repos
         delete_repo(token=TOKEN, repo_id=self.org_repo_id)
         delete_repo(save_org_repo_id, token=TOKEN)
 
-    @pytest.mark.skipif(
+    @unittest.skipIf(
         not is_jinja_available(),
         reason="Model card tests cannot be performed without Jinja installed.",
     )
@@ -403,3 +2115,403 @@ def test_push_to_hub_library_name(self):
 
         # Reset repo
         delete_repo(repo_id, token=TOKEN)
+
+
+@require_torch_accelerator
+@require_torch_2
+@is_torch_compile
+@slow
+@require_torch_version_greater("2.7.1")
+class TorchCompileTesterMixin:
+    different_shapes_for_compilation = None
+
+    def setUp(self):
+        # clean up the VRAM before each test
+        super().setUp()
+        torch.compiler.reset()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def tearDown(self):
+        # clean up the VRAM after each test in case of CUDA runtime errors
+        super().tearDown()
+        torch.compiler.reset()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def test_torch_compile_recompilation_and_graph_break(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model.eval()
+        model = torch.compile(model, fullgraph=True)
+
+        with (
+            torch._inductor.utils.fresh_inductor_cache(),
+            torch._dynamo.config.patch(error_on_recompile=True),
+            torch.no_grad(),
+        ):
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+
+    def test_torch_compile_repeated_blocks(self):
+        if self.model_class._repeated_blocks is None:
+            pytest.skip("Skipping test as the model class doesn't have `_repeated_blocks` set.")
+
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model.eval()
+        model.compile_repeated_blocks(fullgraph=True)
+
+        recompile_limit = 1
+        if self.model_class.__name__ == "UNet2DConditionModel":
+            recompile_limit = 2
+        elif self.model_class.__name__ == "ZImageTransformer2DModel":
+            recompile_limit = 3
+
+        with (
+            torch._inductor.utils.fresh_inductor_cache(),
+            torch._dynamo.config.patch(recompile_limit=recompile_limit),
+            torch.no_grad(),
+        ):
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+
+    def test_compile_with_group_offloading(self):
+        if not self.model_class._supports_group_offloading:
+            pytest.skip("Model does not support group offloading.")
+
+        torch._dynamo.config.cache_size_limit = 10000
+
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.eval()
+        # TODO: Can test for other group offloading kwargs later if needed.
+        group_offload_kwargs = {
+            "onload_device": torch_device,
+            "offload_device": "cpu",
+            "offload_type": "block_level",
+            "num_blocks_per_group": 1,
+            "use_stream": True,
+            "non_blocking": True,
+        }
+        model.enable_group_offload(**group_offload_kwargs)
+        model.compile()
+
+        with torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+
+    def test_compile_on_different_shapes(self):
+        if self.different_shapes_for_compilation is None:
+            pytest.skip(f"Skipping as `different_shapes_for_compilation` is not set for {self.__class__.__name__}.")
+        torch.fx.experimental._config.use_duck_shape = False
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+        model.eval()
+        model = torch.compile(model, fullgraph=True, dynamic=True)
+
+        for height, width in self.different_shapes_for_compilation:
+            with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+                inputs_dict = self.prepare_dummy_input(height=height, width=width)
+                _ = model(**inputs_dict)
+
+    def test_compile_works_with_aot(self):
+        from torch._inductor.package import load_package
+
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        exported_model = torch.export.export(model, args=(), kwargs=inputs_dict)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            package_path = os.path.join(tmpdir, f"{self.model_class.__name__}.pt2")
+            _ = torch._inductor.aoti_compile_and_package(exported_model, package_path=package_path)
+            assert os.path.exists(package_path)
+            loaded_binary = load_package(package_path, run_single_threaded=True)
+
+        model.forward = loaded_binary
+
+        with torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+
+
+@slow
+@require_torch_2
+@require_torch_accelerator
+@require_peft_backend
+@require_peft_version_greater("0.14.0")
+@require_torch_version_greater("2.7.1")
+@is_torch_compile
+class LoraHotSwappingForModelTesterMixin:
+    """Test that hotswapping does not result in recompilation on the model directly.
+
+    We're not extensively testing the hotswapping functionality since it is implemented in PEFT and is extensively
+    tested there. The goal of this test is specifically to ensure that hotswapping with diffusers does not require
+    recompilation.
+
+    See
+    https://github.com/huggingface/peft/blob/eaab05e18d51fb4cce20a73c9acd82a00c013b83/tests/test_gpu_examples.py#L4252
+    for the analogous PEFT test.
+
+    """
+
+    different_shapes_for_compilation = None
+
+    def tearDown(self):
+        # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
+        # there will be recompilation errors, as torch caches the model when run in the same process.
+        super().tearDown()
+        torch.compiler.reset()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_lora_config(self, lora_rank, lora_alpha, target_modules):
+        from peft import LoraConfig
+
+        lora_config = LoraConfig(
+            r=lora_rank,
+            lora_alpha=lora_alpha,
+            target_modules=target_modules,
+            init_lora_weights=False,
+            use_dora=False,
+        )
+        return lora_config
+
+    def get_linear_module_name_other_than_attn(self, model):
+        linear_names = [
+            name for name, module in model.named_modules() if isinstance(module, nn.Linear) and "to_" not in name
+        ]
+        return linear_names[0]
+
+    def check_model_hotswap(self, do_compile, rank0, rank1, target_modules0, target_modules1=None):
+        """
+        Check that hotswapping works on a small unet.
+
+        Steps:
+        - create 2 LoRA adapters and save them
+        - load the first adapter
+        - hotswap the second adapter
+        - check that the outputs are correct
+        - optionally compile the model
+        - optionally check if recompilations happen on different shapes
+
+        Note: We set rank == alpha here because save_lora_adapter does not save the alpha scalings, thus the test would
+        fail if the values are different. Since rank != alpha does not matter for the purpose of this test, this is
+        fine.
+        """
+        different_shapes = self.different_shapes_for_compilation
+        # create 2 adapters with different ranks and alphas
+        torch.manual_seed(0)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+
+        alpha0, alpha1 = rank0, rank1
+        max_rank = max([rank0, rank1])
+        if target_modules1 is None:
+            target_modules1 = target_modules0[:]
+        lora_config0 = self.get_lora_config(rank0, alpha0, target_modules0)
+        lora_config1 = self.get_lora_config(rank1, alpha1, target_modules1)
+
+        model.add_adapter(lora_config0, adapter_name="adapter0")
+        with torch.inference_mode():
+            torch.manual_seed(0)
+            output0_before = model(**inputs_dict)["sample"]
+
+        model.add_adapter(lora_config1, adapter_name="adapter1")
+        model.set_adapter("adapter1")
+        with torch.inference_mode():
+            torch.manual_seed(0)
+            output1_before = model(**inputs_dict)["sample"]
+
+        # sanity checks:
+        tol = 5e-3
+        assert not torch.allclose(output0_before, output1_before, atol=tol, rtol=tol)
+        assert not (output0_before == 0).all()
+        assert not (output1_before == 0).all()
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            # save the adapter checkpoints
+            model.save_lora_adapter(os.path.join(tmp_dirname, "0"), safe_serialization=True, adapter_name="adapter0")
+            model.save_lora_adapter(os.path.join(tmp_dirname, "1"), safe_serialization=True, adapter_name="adapter1")
+            del model
+
+            # load the first adapter
+            torch.manual_seed(0)
+            init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+            model = self.model_class(**init_dict).to(torch_device)
+
+            if do_compile or (rank0 != rank1):
+                # no need to prepare if the model is not compiled or if the ranks are identical
+                model.enable_lora_hotswap(target_rank=max_rank)
+
+            file_name0 = os.path.join(os.path.join(tmp_dirname, "0"), "pytorch_lora_weights.safetensors")
+            file_name1 = os.path.join(os.path.join(tmp_dirname, "1"), "pytorch_lora_weights.safetensors")
+            model.load_lora_adapter(file_name0, safe_serialization=True, adapter_name="adapter0", prefix=None)
+
+            if do_compile:
+                model = torch.compile(model, mode="reduce-overhead", dynamic=different_shapes is not None)
+
+            with torch.inference_mode():
+                # additionally check if dynamic compilation works.
+                if different_shapes is not None:
+                    for height, width in different_shapes:
+                        new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
+                        _ = model(**new_inputs_dict)
+                else:
+                    output0_after = model(**inputs_dict)["sample"]
+                    assert torch.allclose(output0_before, output0_after, atol=tol, rtol=tol)
+
+            # hotswap the 2nd adapter
+            model.load_lora_adapter(file_name1, adapter_name="adapter0", hotswap=True, prefix=None)
+
+            # we need to call forward to potentially trigger recompilation
+            with torch.inference_mode():
+                if different_shapes is not None:
+                    for height, width in different_shapes:
+                        new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
+                        _ = model(**new_inputs_dict)
+                else:
+                    output1_after = model(**inputs_dict)["sample"]
+                    assert torch.allclose(output1_before, output1_after, atol=tol, rtol=tol)
+
+            # check error when not passing valid adapter name
+            name = "does-not-exist"
+            msg = f"Trying to hotswap LoRA adapter '{name}' but there is no existing adapter by that name"
+            with self.assertRaisesRegex(ValueError, msg):
+                model.load_lora_adapter(file_name1, adapter_name=name, hotswap=True, prefix=None)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_model(self, rank0, rank1):
+        self.check_model_hotswap(
+            do_compile=False, rank0=rank0, rank1=rank1, target_modules0=["to_q", "to_k", "to_v", "to_out.0"]
+        )
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_model_linear(self, rank0, rank1):
+        # It's important to add this context to raise an error on recompilation
+        target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
+        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_model_conv2d(self, rank0, rank1):
+        if "unet" not in self.model_class.__name__.lower():
+            pytest.skip("Test only applies to UNet.")
+
+        # It's important to add this context to raise an error on recompilation
+        target_modules = ["conv", "conv1", "conv2"]
+        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_model_both_linear_and_conv2d(self, rank0, rank1):
+        if "unet" not in self.model_class.__name__.lower():
+            pytest.skip("Test only applies to UNet.")
+
+        # It's important to add this context to raise an error on recompilation
+        target_modules = ["to_q", "conv"]
+        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
+    def test_hotswapping_compiled_model_both_linear_and_other(self, rank0, rank1):
+        # In `test_hotswapping_compiled_model_both_linear_and_conv2d()`, we check if we can do hotswapping
+        # with `torch.compile()` for models that have both linear and conv layers. In this test, we check
+        # if we can target a linear layer from the transformer blocks and another linear layer from non-attention
+        # block.
+        target_modules = ["to_q"]
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+
+        target_modules.append(self.get_linear_module_name_other_than_attn(model))
+        del model
+
+        # It's important to add this context to raise an error on recompilation
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
+
+    def test_enable_lora_hotswap_called_after_adapter_added_raises(self):
+        # ensure that enable_lora_hotswap is called before loading the first adapter
+        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+        model.add_adapter(lora_config)
+
+        msg = re.escape("Call `enable_lora_hotswap` before loading the first adapter.")
+        with self.assertRaisesRegex(RuntimeError, msg):
+            model.enable_lora_hotswap(target_rank=32)
+
+    def test_enable_lora_hotswap_called_after_adapter_added_warning(self):
+        # ensure that enable_lora_hotswap is called before loading the first adapter
+        from diffusers.loaders.peft import logger
+
+        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+        model.add_adapter(lora_config)
+        msg = (
+            "It is recommended to call `enable_lora_hotswap` before loading the first adapter to avoid recompilation."
+        )
+        with self.assertLogs(logger=logger, level="WARNING") as cm:
+            model.enable_lora_hotswap(target_rank=32, check_compiled="warn")
+            assert any(msg in log for log in cm.output)
+
+    def test_enable_lora_hotswap_called_after_adapter_added_ignore(self):
+        # check possibility to ignore the error/warning
+        from diffusers.loaders.peft import logger
+
+        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+        model.add_adapter(lora_config)
+        # note: assertNoLogs requires Python 3.10+
+        with self.assertNoLogs(logger, level="WARNING"):
+            model.enable_lora_hotswap(target_rank=32, check_compiled="ignore")
+
+    def test_enable_lora_hotswap_wrong_check_compiled_argument_raises(self):
+        # check that wrong argument value raises an error
+        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+        model.add_adapter(lora_config)
+        msg = re.escape("check_compiles should be one of 'error', 'warn', or 'ignore', got 'wrong-argument' instead.")
+        with self.assertRaisesRegex(ValueError, msg):
+            model.enable_lora_hotswap(target_rank=32, check_compiled="wrong-argument")
+
+    def test_hotswap_second_adapter_targets_more_layers_raises(self):
+        # check the error and log
+        from diffusers.loaders.peft import logger
+
+        # at the moment, PEFT requires the 2nd adapter to target the same or a subset of layers
+        target_modules0 = ["to_q"]
+        target_modules1 = ["to_q", "to_k"]
+        with self.assertRaises(RuntimeError):  # peft raises RuntimeError
+            with self.assertLogs(logger=logger, level="ERROR") as cm:
+                self.check_model_hotswap(
+                    do_compile=True, rank0=8, rank1=8, target_modules0=target_modules0, target_modules1=target_modules1
+                )
+                assert any("Hotswapping adapter0 was unsuccessful" in log for log in cm.output)
+
+    @parameterized.expand([(11, 11), (7, 13), (13, 7)])
+    @require_torch_version_greater("2.7.1")
+    def test_hotswapping_compile_on_different_shapes(self, rank0, rank1):
+        different_shapes_for_compilation = self.different_shapes_for_compilation
+        if different_shapes_for_compilation is None:
+            pytest.skip(f"Skipping as `different_shapes_for_compilation` is not set for {self.__class__.__name__}.")
+        # Specifying `use_duck_shape=False` instructs the compiler if it should use the same symbolic
+        # variable to represent input sizes that are the same. For more details,
+        # check out this [comment](https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790).
+        torch.fx.experimental._config.use_duck_shape = False
+
+        target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
+        with torch._dynamo.config.patch(error_on_recompile=True):
+            self.check_model_hotswap(
+                do_compile=True,
+                rank0=rank0,
+                rank1=rank1,
+                target_modules0=target_modules,
+            )
diff --git a/tests/models/transformers/test_models_dit_transformer2d.py b/tests/models/transformers/test_models_dit_transformer2d.py
index f1efb362d104..473a87637578 100644
--- a/tests/models/transformers/test_models_dit_transformer2d.py
+++ b/tests/models/transformers/test_models_dit_transformer2d.py
@@ -13,48 +13,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
+import unittest
+
 import torch
 
 from diffusers import DiTTransformer2DModel, Transformer2DModel
-from diffusers.utils.torch_utils import randn_tensor
-
-from ...testing_utils import enable_full_determinism, slow, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
+
+from ...testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    slow,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class DiTTransformer2DTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return DiTTransformer2DModel
+class DiTTransformer2DModelTests(ModelTesterMixin, unittest.TestCase):
+    model_class = DiTTransformer2DModel
+    main_input_name = "hidden_states"
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 4
+        in_channels = 4
+        sample_size = 8
+        scheduler_num_train_steps = 1000
+        num_class_labels = 4
+
+        hidden_states = floats_tensor((batch_size, in_channels, sample_size, sample_size)).to(torch_device)
+        timesteps = torch.randint(0, scheduler_num_train_steps, size=(batch_size,)).to(torch_device)
+        class_label_ids = torch.randint(0, num_class_labels, size=(batch_size,)).to(torch_device)
+
+        return {"hidden_states": hidden_states, "timestep": timesteps, "class_labels": class_label_ids}
 
     @property
-    def input_shape(self) -> tuple:
+    def input_shape(self):
         return (4, 8, 8)
 
     @property
-    def output_shape(self) -> tuple:
+    def output_shape(self):
         return (8, 8, 8)
 
-    @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
-
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             "in_channels": 4,
             "out_channels": 8,
             "activation_fn": "gelu-approximate",
@@ -67,38 +71,26 @@ def get_init_dict(self) -> dict:
             "patch_size": 2,
             "sample_size": 8,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 4) -> dict[str, torch.Tensor]:
-        in_channels = 4
-        sample_size = 8
-        scheduler_num_train_steps = 1000
-        num_class_labels = 4
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, in_channels, sample_size, sample_size), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, scheduler_num_train_steps, size=(batch_size,), generator=self.generator).to(
-                torch_device
-            ),
-            "class_labels": torch.randint(0, num_class_labels, size=(batch_size,), generator=self.generator).to(
-                torch_device
-            ),
-        }
-
-
-class TestDiTTransformer2D(DiTTransformer2DTesterConfig, ModelTesterMixin):
-    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
-    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
-        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
-        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
-        pytest.skip("Tolerance requirements too high for meaningful test")
+    def test_output(self):
+        super().test_output(
+            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
+        )
 
     def test_correct_class_remapping_from_dict_config(self):
-        init_dict = self.get_init_dict()
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
         model = Transformer2DModel.from_config(init_dict)
         assert isinstance(model, DiTTransformer2DModel)
 
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"DiTTransformer2DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    def test_effective_gradient_checkpointing(self):
+        super().test_effective_gradient_checkpointing(loss_tolerance=1e-4)
+
     def test_correct_class_remapping_from_pretrained_config(self):
         config = DiTTransformer2DModel.load_config("facebook/DiT-XL-2-256", subfolder="transformer")
         model = Transformer2DModel.from_config(config)
@@ -108,20 +100,3 @@ def test_correct_class_remapping_from_pretrained_config(self):
     def test_correct_class_remapping(self):
         model = Transformer2DModel.from_pretrained("facebook/DiT-XL-2-256", subfolder="transformer")
         assert isinstance(model, DiTTransformer2DModel)
-
-
-class TestDiTTransformer2DMemory(DiTTransformer2DTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestDiTTransformer2DAttention(DiTTransformer2DTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestDiTTransformer2DTraining(DiTTransformer2DTesterConfig, TrainingTesterMixin):
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"DiTTransformer2DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-    def test_gradient_checkpointing_equivalence(self):
-        super().test_gradient_checkpointing_equivalence(loss_tolerance=1e-4)
diff --git a/tests/models/transformers/test_models_pixart_transformer2d.py b/tests/models/transformers/test_models_pixart_transformer2d.py
index 879274c52dbd..17c400cf1911 100644
--- a/tests/models/transformers/test_models_pixart_transformer2d.py
+++ b/tests/models/transformers/test_models_pixart_transformer2d.py
@@ -13,53 +13,60 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
+import unittest
+
 import torch
 
 from diffusers import PixArtTransformer2DModel, Transformer2DModel
-from diffusers.utils.torch_utils import randn_tensor
-
-from ...testing_utils import enable_full_determinism, slow, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
+
+from ...testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    slow,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class PixArtTransformer2DTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return PixArtTransformer2DModel
+class PixArtTransformer2DModelTests(ModelTesterMixin, unittest.TestCase):
+    model_class = PixArtTransformer2DModel
+    main_input_name = "hidden_states"
+    # We override the items here because the transformer under consideration is small.
+    model_split_percents = [0.7, 0.6, 0.6]
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 4
+        in_channels = 4
+        sample_size = 8
+        scheduler_num_train_steps = 1000
+        cross_attention_dim = 8
+        seq_len = 8
 
-    @property
-    def input_shape(self) -> tuple:
-        return (4, 8, 8)
+        hidden_states = floats_tensor((batch_size, in_channels, sample_size, sample_size)).to(torch_device)
+        timesteps = torch.randint(0, scheduler_num_train_steps, size=(batch_size,)).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, seq_len, cross_attention_dim)).to(torch_device)
 
-    @property
-    def output_shape(self) -> tuple:
-        return (8, 8, 8)
+        return {
+            "hidden_states": hidden_states,
+            "timestep": timesteps,
+            "encoder_hidden_states": encoder_hidden_states,
+            "added_cond_kwargs": {"aspect_ratio": None, "resolution": None},
+        }
 
     @property
-    def model_split_percents(self) -> list:
-        # We override the items here because the transformer under consideration is small.
-        return [0.7, 0.6, 0.6]
+    def input_shape(self):
+        return (4, 8, 8)
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
+    def output_shape(self):
+        return (8, 8, 8)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             "sample_size": 8,
             "num_layers": 1,
             "patch_size": 2,
@@ -77,37 +84,20 @@ def get_init_dict(self) -> dict:
             "use_additional_conditions": False,
             "caption_channels": None,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 4) -> dict[str, torch.Tensor]:
-        in_channels = 4
-        sample_size = 8
-        scheduler_num_train_steps = 1000
-        cross_attention_dim = 8
-        seq_len = 8
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, in_channels, sample_size, sample_size), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, scheduler_num_train_steps, size=(batch_size,), generator=self.generator).to(
-                torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, seq_len, cross_attention_dim), generator=self.generator, device=torch_device
-            ),
-            "added_cond_kwargs": {"aspect_ratio": None, "resolution": None},
-        }
-
+    def test_output(self):
+        super().test_output(
+            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
+        )
 
-class TestPixArtTransformer2D(PixArtTransformer2DTesterConfig, ModelTesterMixin):
-    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
-    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
-        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
-        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
-        pytest.skip("Tolerance requirements too high for meaningful test")
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"PixArtTransformer2DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
 
     def test_correct_class_remapping_from_dict_config(self):
-        init_dict = self.get_init_dict()
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
         model = Transformer2DModel.from_config(init_dict)
         assert isinstance(model, PixArtTransformer2DModel)
 
@@ -120,17 +110,3 @@ def test_correct_class_remapping_from_pretrained_config(self):
     def test_correct_class_remapping(self):
         model = Transformer2DModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="transformer")
         assert isinstance(model, PixArtTransformer2DModel)
-
-
-class TestPixArtTransformer2DMemory(PixArtTransformer2DTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestPixArtTransformer2DAttention(PixArtTransformer2DTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestPixArtTransformer2DTraining(PixArtTransformer2DTesterConfig, TrainingTesterMixin):
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"PixArtTransformer2DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_prior.py b/tests/models/transformers/test_models_prior.py
index 1da32b77786a..af5ac4bbbd76 100644
--- a/tests/models/transformers/test_models_prior.py
+++ b/tests/models/transformers/test_models_prior.py
@@ -21,69 +21,41 @@
 from parameterized import parameterized
 
 from diffusers import PriorTransformer
-from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
+    floats_tensor,
     slow,
     torch_all_close,
     torch_device,
 )
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
-)
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class PriorTransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return PriorTransformer
-
-    @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
-
-    @property
-    def input_shape(self) -> tuple:
-        return (4, 8)
-
-    @property
-    def output_shape(self) -> tuple:
-        return (4, 8)
+class PriorTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = PriorTransformer
+    main_input_name = "hidden_states"
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
-
-    def get_init_dict(self) -> dict:
-        return {
-            "num_attention_heads": 2,
-            "attention_head_dim": 4,
-            "num_layers": 2,
-            "embedding_dim": 8,
-            "num_embeddings": 7,
-            "additional_embeddings": 4,
-        }
-
-    def get_dummy_inputs(self, batch_size: int = 4) -> dict:
+    def dummy_input(self):
+        batch_size = 4
         embedding_dim = 8
         num_embeddings = 7
 
+        hidden_states = floats_tensor((batch_size, embedding_dim)).to(torch_device)
+
+        proj_embedding = floats_tensor((batch_size, embedding_dim)).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, num_embeddings, embedding_dim)).to(torch_device)
+
         return {
-            "hidden_states": randn_tensor((batch_size, embedding_dim), generator=self.generator, device=torch_device),
+            "hidden_states": hidden_states,
             "timestep": 2,
-            "proj_embedding": randn_tensor((batch_size, embedding_dim), generator=self.generator, device=torch_device),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, num_embeddings, embedding_dim), generator=self.generator, device=torch_device
-            ),
+            "proj_embedding": proj_embedding,
+            "encoder_hidden_states": encoder_hidden_states,
         }
 
     def get_dummy_seed_input(self, seed=0):
@@ -93,6 +65,7 @@ def get_dummy_seed_input(self, seed=0):
         num_embeddings = 7
 
         hidden_states = torch.randn((batch_size, embedding_dim)).to(torch_device)
+
         proj_embedding = torch.randn((batch_size, embedding_dim)).to(torch_device)
         encoder_hidden_states = torch.randn((batch_size, num_embeddings, embedding_dim)).to(torch_device)
 
@@ -103,28 +76,48 @@ def get_dummy_seed_input(self, seed=0):
             "encoder_hidden_states": encoder_hidden_states,
         }
 
+    @property
+    def input_shape(self):
+        return (4, 8)
+
+    @property
+    def output_shape(self):
+        return (4, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "num_attention_heads": 2,
+            "attention_head_dim": 4,
+            "num_layers": 2,
+            "embedding_dim": 8,
+            "num_embeddings": 7,
+            "additional_embeddings": 4,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-class TestPriorTransformer(PriorTransformerTesterConfig, ModelTesterMixin):
     def test_from_pretrained_hub(self):
         model, loading_info = PriorTransformer.from_pretrained(
             "hf-internal-testing/prior-dummy", output_loading_info=True
         )
-        assert model is not None
-        assert len(loading_info["missing_keys"]) == 0
+        self.assertIsNotNone(model)
+        self.assertEqual(len(loading_info["missing_keys"]), 0)
 
         model.to(torch_device)
-        hidden_states = model(**self.get_dummy_inputs())[0]
+        hidden_states = model(**self.dummy_input)[0]
 
         assert hidden_states is not None, "Make sure output is not None"
 
     def test_forward_signature(self):
-        model = self.model_class(**self.get_init_dict())
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict)
         signature = inspect.signature(model.forward)
         # signature.parameters is an OrderedDict => so arg_names order is deterministic
         arg_names = [*signature.parameters.keys()]
 
         expected_arg_names = ["hidden_states", "timestep"]
-        assert arg_names[:2] == expected_arg_names
+        self.assertListEqual(arg_names[:2], expected_arg_names)
 
     def test_output_pretrained(self):
         model = PriorTransformer.from_pretrained("hf-internal-testing/prior-dummy")
@@ -143,19 +136,7 @@ def test_output_pretrained(self):
         # Since the VAE Gaussian prior's generator is seeded on the appropriate device,
         # the expected output slices are not the same for CPU and GPU.
         expected_output_slice = torch.tensor([-1.3436, -0.2870, 0.7538, 0.4368, -0.0239])
-        assert torch_all_close(output_slice, expected_output_slice, rtol=1e-2)
-
-
-class TestPriorTransformerMemory(PriorTransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestPriorTransformerAttention(PriorTransformerTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestPriorTransformerTraining(PriorTransformerTesterConfig, TrainingTesterMixin):
-    pass
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
 
 
 @slow
diff --git a/tests/models/transformers/test_models_transformer_allegro.py b/tests/models/transformers/test_models_transformer_allegro.py
index 0c3e302a3f0d..7c002f87819e 100644
--- a/tests/models/transformers/test_models_transformer_allegro.py
+++ b/tests/models/transformers/test_models_transformer_allegro.py
@@ -12,47 +12,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import unittest
+
 import torch
 
 from diffusers import AllegroTransformer3DModel
-from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class AllegroTransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return AllegroTransformer3DModel
+class AllegroTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = AllegroTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        num_frames = 2
+        height = 8
+        width = 8
+        embedding_dim = 16
+        sequence_length = 16
 
-    @property
-    def input_shape(self) -> tuple:
-        return (4, 2, 8, 8)
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim // 2)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
 
     @property
-    def output_shape(self) -> tuple:
+    def input_shape(self):
         return (4, 2, 8, 8)
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
+    def output_shape(self):
+        return (4, 2, 8, 8)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings.
             "num_attention_heads": 2,
             "attention_head_dim": 8,
@@ -65,38 +75,9 @@ def get_init_dict(self) -> dict:
             "sample_frames": 8,
             "caption_channels": 8,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
-        num_channels = 4
-        num_frames = 2
-        height = width = 8
-        embedding_dim = 16
-        sequence_length = 16
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, embedding_dim // 2), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
-        }
-
-
-class TestAllegroTransformer(AllegroTransformerTesterConfig, ModelTesterMixin):
-    pass
-
-
-class TestAllegroTransformerMemory(AllegroTransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestAllegroTransformerAttention(AllegroTransformerTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestAllegroTransformerTraining(AllegroTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"AllegroTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_aura_flow.py b/tests/models/transformers/test_models_transformer_aura_flow.py
index 3e13945977fd..ae8c3b7234a3 100644
--- a/tests/models/transformers/test_models_transformer_aura_flow.py
+++ b/tests/models/transformers/test_models_transformer_aura_flow.py
@@ -13,52 +13,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import unittest
+
 import torch
 
 from diffusers import AuraFlowTransformer2DModel
-from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
-)
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class AuraFlowTransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return AuraFlowTransformer2DModel
+class AuraFlowTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = AuraFlowTransformer2DModel
+    main_input_name = "hidden_states"
+    # We override the items here because the transformer under consideration is small.
+    model_split_percents = [0.7, 0.6, 0.6]
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        height = width = embedding_dim = 32
+        sequence_length = 256
 
-    @property
-    def input_shape(self) -> tuple:
-        return (4, 32, 32)
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
 
-    @property
-    def output_shape(self) -> tuple:
-        return (4, 32, 32)
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
 
     @property
-    def model_split_percents(self) -> list:
-        # We override the items here because the transformer under consideration is small.
-        return [0.7, 0.6, 0.6]
+    def input_shape(self):
+        return (4, 32, 32)
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
+    def output_shape(self):
+        return (4, 32, 32)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             "sample_size": 32,
             "patch_size": 2,
             "in_channels": 4,
@@ -71,36 +71,13 @@ def get_init_dict(self) -> dict:
             "out_channels": 4,
             "pos_embed_max_size": 256,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
-        num_channels = 4
-        height = width = embedding_dim = 32
-        sequence_length = 256
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
-        }
-
-
-class TestAuraFlowTransformer(AuraFlowTransformerTesterConfig, ModelTesterMixin):
-    pass
-
-
-class TestAuraFlowTransformerMemory(AuraFlowTransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestAuraFlowTransformerAttention(AuraFlowTransformerTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestAuraFlowTransformerTraining(AuraFlowTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"AuraFlowTransformer2DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    @unittest.skip("AuraFlowTransformer2DModel uses its own dedicated attention processor. This test does not apply")
+    def test_set_attn_processor_for_determinism(self):
+        pass
diff --git a/tests/models/transformers/test_models_transformer_cogvideox.py b/tests/models/transformers/test_models_transformer_cogvideox.py
index 97ac1b40621f..f632add7e5a7 100644
--- a/tests/models/transformers/test_models_transformer_cogvideox.py
+++ b/tests/models/transformers/test_models_transformer_cogvideox.py
@@ -13,51 +13,58 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import unittest
+
 import torch
 
 from diffusers import CogVideoXTransformer3DModel
-from diffusers.utils.torch_utils import randn_tensor
-
-from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
+
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class CogVideoXTransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return CogVideoXTransformer3DModel
+class CogVideoXTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = CogVideoXTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+    model_split_percents = [0.7, 0.7, 0.8]
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        num_frames = 1
+        height = 8
+        width = 8
+        embedding_dim = 8
+        sequence_length = 8
 
-    @property
-    def input_shape(self) -> tuple:
-        return (1, 4, 8, 8)
+        hidden_states = torch.randn((batch_size, num_frames, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
 
-    @property
-    def output_shape(self) -> tuple:
-        return (1, 4, 8, 8)
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
 
     @property
-    def model_split_percents(self) -> list:
-        return [0.7, 0.7, 0.8]
+    def input_shape(self):
+        return (1, 4, 8, 8)
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
+    def output_shape(self):
+        return (1, 4, 8, 8)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings.
             "num_attention_heads": 2,
             "attention_head_dim": 8,
@@ -74,36 +81,49 @@ def get_init_dict(self) -> dict:
             "temporal_compression_ratio": 4,
             "max_text_seq_length": 8,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"CogVideoXTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
 
-    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
+class CogVideoX1_5TransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = CogVideoXTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):
+        batch_size = 2
         num_channels = 4
-        num_frames = 1
-        height = width = 8
+        num_frames = 2
+        height = 8
+        width = 8
         embedding_dim = 8
         sequence_length = 8
 
+        hidden_states = torch.randn((batch_size, num_frames, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
         return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_frames, num_channels, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
         }
 
-
-class CogVideoX15TransformerTesterConfig(CogVideoXTransformerTesterConfig):
     @property
-    def output_shape(self) -> tuple:
-        return (2, 4, 8, 8)
+    def input_shape(self):
+        return (1, 4, 8, 8)
 
     @property
-    def model_split_percents(self) -> list:
-        return [0.9]
+    def output_shape(self):
+        return (1, 4, 8, 8)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             # Product of num_attention_heads * attention_head_dim must be divisible by 16 for 3D positional embeddings.
             "num_attention_heads": 2,
             "attention_head_dim": 8,
@@ -121,56 +141,9 @@ def get_init_dict(self) -> dict:
             "max_text_seq_length": 8,
             "use_rotary_positional_embeddings": True,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
-        num_channels = 4
-        num_frames = 2
-        height = width = 8
-        embedding_dim = 8
-        sequence_length = 8
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_frames, num_channels, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
-        }
-
-
-class TestCogVideoXTransformer(CogVideoXTransformerTesterConfig, ModelTesterMixin):
-    pass
-
-
-class TestCogVideoXTransformerMemory(CogVideoXTransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestCogVideoXTransformerAttention(CogVideoXTransformerTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestCogVideoXTransformerTraining(CogVideoXTransformerTesterConfig, TrainingTesterMixin):
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"CogVideoXTransformer3DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-
-class TestCogVideoX15Transformer(CogVideoX15TransformerTesterConfig, ModelTesterMixin):
-    pass
-
-
-class TestCogVideoX15TransformerMemory(CogVideoX15TransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestCogVideoX15TransformerAttention(CogVideoX15TransformerTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestCogVideoX15TransformerTraining(CogVideoX15TransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"CogVideoXTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_cogview3plus.py b/tests/models/transformers/test_models_transformer_cogview3plus.py
index 97ac28a108e1..d38d77531d4c 100644
--- a/tests/models/transformers/test_models_transformer_cogview3plus.py
+++ b/tests/models/transformers/test_models_transformer_cogview3plus.py
@@ -13,52 +13,63 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
+import unittest
+
 import torch
 
 from diffusers import CogView3PlusTransformer2DModel
-from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class CogView3PlusTransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return CogView3PlusTransformer2DModel
+class CogView3PlusTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = CogView3PlusTransformer2DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+    model_split_percents = [0.7, 0.6, 0.6]
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        height = 8
+        width = 8
+        embedding_dim = 8
+        sequence_length = 8
 
-    @property
-    def input_shape(self) -> tuple:
-        return (1, 4, 8, 8)
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        original_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        target_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        crop_coords = torch.tensor([0, 0]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
 
-    @property
-    def output_shape(self) -> tuple:
-        return (1, 4, 8, 8)
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "original_size": original_size,
+            "target_size": target_size,
+            "crop_coords": crop_coords,
+            "timestep": timestep,
+        }
 
     @property
-    def model_split_percents(self) -> list:
-        return [0.7, 0.6, 0.6]
+    def input_shape(self):
+        return (1, 4, 8, 8)
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
+    def output_shape(self):
+        return (1, 4, 8, 8)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             "patch_size": 2,
             "in_channels": 4,
             "num_layers": 2,
@@ -71,48 +82,9 @@ def get_init_dict(self) -> dict:
             "pos_embed_max_size": 8,
             "sample_size": 8,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
-        num_channels = 4
-        height = width = 8
-        embedding_dim = 8
-        sequence_length = 8
-
-        original_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        target_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        crop_coords = torch.tensor([0, 0]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-            ),
-            "original_size": original_size,
-            "target_size": target_size,
-            "crop_coords": crop_coords,
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
-        }
-
-
-class TestCogView3PlusTransformer(CogView3PlusTransformerTesterConfig, ModelTesterMixin):
-    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
-    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
-        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
-        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
-        pytest.skip("Tolerance requirements too high for meaningful test")
-
-
-class TestCogView3PlusTransformerMemory(CogView3PlusTransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestCogView3PlusTransformerAttention(CogView3PlusTransformerTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestCogView3PlusTransformerTraining(CogView3PlusTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"CogView3PlusTransformer2DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_cogview4.py b/tests/models/transformers/test_models_transformer_cogview4.py
index 0f390cb356e9..084c3b7cea41 100644
--- a/tests/models/transformers/test_models_transformer_cogview4.py
+++ b/tests/models/transformers/test_models_transformer_cogview4.py
@@ -12,47 +12,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import unittest
+
 import torch
 
 from diffusers import CogView4Transformer2DModel
-from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
-)
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class CogView4TransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return CogView4Transformer2DModel
+class CogView3PlusTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = CogView4Transformer2DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        height = 8
+        width = 8
+        embedding_dim = 8
+        sequence_length = 8
 
-    @property
-    def input_shape(self) -> tuple:
-        return (4, 8, 8)
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        original_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        target_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        crop_coords = torch.tensor([0, 0]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+            "original_size": original_size,
+            "target_size": target_size,
+            "crop_coords": crop_coords,
+        }
 
     @property
-    def output_shape(self) -> tuple:
+    def input_shape(self):
         return (4, 8, 8)
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
+    def output_shape(self):
+        return (4, 8, 8)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             "patch_size": 2,
             "in_channels": 4,
             "num_layers": 2,
@@ -63,44 +75,9 @@ def get_init_dict(self) -> dict:
             "time_embed_dim": 8,
             "condition_dim": 4,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
-        num_channels = 4
-        height = width = 8
-        embedding_dim = 8
-        sequence_length = 8
-
-        original_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        target_size = torch.tensor([height * 8, width * 8]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-        crop_coords = torch.tensor([0, 0]).unsqueeze(0).repeat(batch_size, 1).to(torch_device)
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
-            "original_size": original_size,
-            "target_size": target_size,
-            "crop_coords": crop_coords,
-        }
-
-
-class TestCogView4Transformer(CogView4TransformerTesterConfig, ModelTesterMixin):
-    pass
-
-
-class TestCogView4TransformerMemory(CogView4TransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestCogView4TransformerAttention(CogView4TransformerTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestCogView4TransformerTraining(CogView4TransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"CogView4Transformer2DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_consisid.py b/tests/models/transformers/test_models_transformer_consisid.py
index cb02e8a359b3..77fc172d078a 100644
--- a/tests/models/transformers/test_models_transformer_consisid.py
+++ b/tests/models/transformers/test_models_transformer_consisid.py
@@ -13,46 +13,61 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import unittest
+
 import torch
 
 from diffusers import ConsisIDTransformer3DModel
-from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class ConsisIDTransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return ConsisIDTransformer3DModel
+class ConsisIDTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = ConsisIDTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        num_frames = 1
+        height = 8
+        width = 8
+        embedding_dim = 8
+        sequence_length = 8
 
-    @property
-    def input_shape(self) -> tuple:
-        return (1, 4, 8, 8)
+        hidden_states = torch.randn((batch_size, num_frames, num_channels, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+        id_vit_hidden = [torch.ones([batch_size, 2, 2]).to(torch_device)] * 1
+        id_cond = torch.ones(batch_size, 2).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+            "id_vit_hidden": id_vit_hidden,
+            "id_cond": id_cond,
+        }
 
     @property
-    def output_shape(self) -> tuple:
+    def input_shape(self):
         return (1, 4, 8, 8)
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
+    def output_shape(self):
+        return (1, 4, 8, 8)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             "num_attention_heads": 2,
             "attention_head_dim": 8,
             "in_channels": 4,
@@ -82,36 +97,9 @@ def get_init_dict(self) -> dict:
             "LFE_ff_mult": 1,
             "LFE_num_scale": 1,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
-        num_channels = 4
-        num_frames = 1
-        height = width = 8
-        embedding_dim = 8
-        sequence_length = 8
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_frames, num_channels, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
-            "id_vit_hidden": [torch.ones([batch_size, 2, 2]).to(torch_device)] * 1,
-            "id_cond": torch.ones(batch_size, 2).to(torch_device),
-        }
-
-
-class TestConsisIDTransformer(ConsisIDTransformerTesterConfig, ModelTesterMixin):
-    pass
-
-
-class TestConsisIDTransformerMemory(ConsisIDTransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestConsisIDTransformerTraining(ConsisIDTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"ConsisIDTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_latte.py b/tests/models/transformers/test_models_transformer_latte.py
index 946e5ce8a5a9..7bf2c52e6269 100644
--- a/tests/models/transformers/test_models_transformer_latte.py
+++ b/tests/models/transformers/test_models_transformer_latte.py
@@ -13,48 +13,56 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
+import unittest
+
 import torch
 
 from diffusers import LatteTransformer3DModel
-from diffusers.utils.torch_utils import randn_tensor
-
-from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
+
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class LatteTransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return LatteTransformer3DModel
+class LatteTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = LatteTransformer3DModel
+    main_input_name = "hidden_states"
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        num_frames = 1
+        height = width = 8
+        embedding_dim = 8
+        sequence_length = 8
+
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+            "enable_temporal_attentions": True,
+        }
 
     @property
-    def input_shape(self) -> tuple:
+    def input_shape(self):
         return (4, 1, 8, 8)
 
     @property
-    def output_shape(self) -> tuple:
+    def output_shape(self):
         return (8, 1, 8, 8)
 
-    @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
-
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             "sample_size": 8,
             "num_layers": 1,
             "patch_size": 2,
@@ -71,43 +79,14 @@ def get_init_dict(self) -> dict:
             "norm_elementwise_affine": False,
             "norm_eps": 1e-6,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
-        num_channels = 4
-        num_frames = 1
-        height = width = 8
-        embedding_dim = 8
-        sequence_length = 8
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, embedding_dim), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
-            "enable_temporal_attentions": True,
-        }
-
-
-class TestLatteTransformer(LatteTransformerTesterConfig, ModelTesterMixin):
-    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
-    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
-        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
-        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
-        pytest.skip("Tolerance requirements too high for meaningful test")
-
-
-class TestLatteTransformerMemory(LatteTransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestLatteTransformerAttention(LatteTransformerTesterConfig, AttentionTesterMixin):
-    pass
-
+    def test_output(self):
+        super().test_output(
+            expected_output_shape=(self.dummy_input[self.main_input_name].shape[0],) + self.output_shape
+        )
 
-class TestLatteTransformerTraining(LatteTransformerTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"LatteTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_motif_video.py b/tests/models/transformers/test_models_transformer_motif_video.py
index 8d8693acda37..d3ac3a874927 100644
--- a/tests/models/transformers/test_models_transformer_motif_video.py
+++ b/tests/models/transformers/test_models_transformer_motif_video.py
@@ -19,10 +19,10 @@
 from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import enable_full_determinism, torch_device
+from ..test_modeling_common import LoraHotSwappingForModelTesterMixin
 from ..testing_utils import (
     AttentionTesterMixin,
     BaseModelTesterConfig,
-    LoraHotSwappingForModelTesterMixin,
     LoraTesterMixin,
     MemoryTesterMixin,
     ModelTesterMixin,
diff --git a/tests/models/transformers/test_models_transformer_sana_video.py b/tests/models/transformers/test_models_transformer_sana_video.py
index e9d3a2d8da8e..ff564ed8918d 100644
--- a/tests/models/transformers/test_models_transformer_sana_video.py
+++ b/tests/models/transformers/test_models_transformer_sana_video.py
@@ -12,48 +12,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
+import unittest
+
 import torch
 
 from diffusers import SanaVideoTransformer3DModel
-from diffusers.utils.torch_utils import randn_tensor
-
-from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
+
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
 
 
 enable_full_determinism()
 
 
-class SanaVideoTransformer3DTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return SanaVideoTransformer3DModel
+class SanaVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+    model_class = SanaVideoTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 1
+        num_channels = 16
+        num_frames = 2
+        height = 16
+        width = 16
+        text_encoder_embedding_dim = 16
+        sequence_length = 12
 
-    @property
-    def input_shape(self) -> tuple:
-        return (16, 2, 16, 16)
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
 
     @property
-    def output_shape(self) -> tuple:
+    def input_shape(self):
         return (16, 2, 16, 16)
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
+    def output_shape(self):
+        return (16, 2, 16, 16)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             "in_channels": 16,
             "out_channels": 16,
             "num_attention_heads": 2,
@@ -73,44 +82,16 @@ def get_init_dict(self) -> dict:
             "qk_norm": "rms_norm_across_heads",
             "rope_max_seq_len": 32,
         }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
 
-    def get_dummy_inputs(self, batch_size: int = 1) -> dict[str, torch.Tensor]:
-        num_channels = 16
-        num_frames = 2
-        height = width = 16
-        text_encoder_embedding_dim = 16
-        sequence_length = 12
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_channels, num_frames, height, width), generator=self.generator, device=torch_device
-            ),
-            "encoder_hidden_states": randn_tensor(
-                (batch_size, sequence_length, text_encoder_embedding_dim),
-                generator=self.generator,
-                device=torch_device,
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
-        }
-
-
-class TestSanaVideoTransformer3D(SanaVideoTransformer3DTesterConfig, ModelTesterMixin):
-    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
-    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
-        # Skip: fp16/bf16 require very high atol to pass, providing little signal.
-        # Dtype preservation is already tested by test_from_save_pretrained_dtype and test_keep_in_fp32_modules.
-        pytest.skip("Tolerance requirements too high for meaningful test")
-
-
-class TestSanaVideoTransformer3DMemory(SanaVideoTransformer3DTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestSanaVideoTransformer3DAttention(SanaVideoTransformer3DTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestSanaVideoTransformer3DTraining(SanaVideoTransformer3DTesterConfig, TrainingTesterMixin):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"SanaVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class SanaVideoTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+    model_class = SanaVideoTransformer3DModel
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return SanaVideoTransformer3DTests().prepare_init_args_and_inputs_for_common()
diff --git a/tests/models/transformers/test_models_transformer_temporal.py b/tests/models/transformers/test_models_transformer_temporal.py
index ff917f65cf33..aff83be51124 100644
--- a/tests/models/transformers/test_models_transformer_temporal.py
+++ b/tests/models/transformers/test_models_transformer_temporal.py
@@ -13,77 +13,55 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import unittest
+
 import torch
 
 from diffusers.models.transformers import TransformerTemporalModel
-from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import enable_full_determinism, torch_device
-from ..testing_utils import (
-    AttentionTesterMixin,
-    BaseModelTesterConfig,
-    MemoryTesterMixin,
-    ModelTesterMixin,
-    TrainingTesterMixin,
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
 )
+from ..test_modeling_common import ModelTesterMixin
 
 
 enable_full_determinism()
 
 
-class TemporalTransformerTesterConfig(BaseModelTesterConfig):
-    @property
-    def model_class(self):
-        return TransformerTemporalModel
+class TemporalTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = TransformerTemporalModel
+    main_input_name = "hidden_states"
 
     @property
-    def main_input_name(self) -> str:
-        return "hidden_states"
+    def dummy_input(self):
+        batch_size = 2
+        num_channels = 4
+        height = width = 32
 
-    @property
-    def input_shape(self) -> tuple:
-        return (4, 32, 32)
+        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+        }
 
     @property
-    def output_shape(self) -> tuple:
+    def input_shape(self):
         return (4, 32, 32)
 
     @property
-    def generator(self):
-        return torch.Generator("cpu").manual_seed(0)
+    def output_shape(self):
+        return (4, 32, 32)
 
-    def get_init_dict(self) -> dict:
-        return {
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
             "num_attention_heads": 8,
             "attention_head_dim": 4,
             "in_channels": 4,
             "num_layers": 1,
             "norm_num_groups": 1,
         }
-
-    def get_dummy_inputs(self, batch_size: int = 2) -> dict[str, torch.Tensor]:
-        num_channels = 4
-        height = width = 32
-
-        return {
-            "hidden_states": randn_tensor(
-                (batch_size, num_channels, height, width), generator=self.generator, device=torch_device
-            ),
-            "timestep": torch.randint(0, 1000, size=(batch_size,), generator=self.generator).to(torch_device),
-        }
-
-
-class TestTemporalTransformer(TemporalTransformerTesterConfig, ModelTesterMixin):
-    pass
-
-
-class TestTemporalTransformerMemory(TemporalTransformerTesterConfig, MemoryTesterMixin):
-    pass
-
-
-class TestTemporalTransformerAttention(TemporalTransformerTesterConfig, AttentionTesterMixin):
-    pass
-
-
-class TestTemporalTransformerTraining(TemporalTransformerTesterConfig, TrainingTesterMixin):
-    pass
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
diff --git a/tests/others/test_utils.py b/tests/others/test_utils.py
index 5db007b7ed6d..4600f5f3710a 100755
--- a/tests/others/test_utils.py
+++ b/tests/others/test_utils.py
@@ -342,6 +342,6 @@ def is_staging_test(test_case):
     Those tests will run using the staging environment of huggingface.co instead of the real model hub.
     """
     if not _run_staging:
-        return pytest.mark.skip("test is staging test")(test_case)
+        return unittest.skip("test is staging test")(test_case)
     else:
         return pytest.mark.is_staging_test()(test_case)

From 1fef16a8db7e8e89130da5aec432c0b64124bc54 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 17 Jun 2026 08:36:51 +0000
Subject: [PATCH 6/7] fix more

---
 ...test_models_autoencoder_kl_temporal_decoder.py | 12 +++++++++++-
 .../test_models_consistency_decoder_vae.py        |  9 +--------
 tests/models/testing_utils/common.py              | 15 +++++++--------
 .../test_models_transformer_chronoedit.py         | 12 +++++++++++-
 .../test_models_transformer_skyreels_v2.py        | 12 +++++++++++-
 tests/models/unets/test_models_unet_2d.py         | 11 +++++++++++
 6 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py b/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py
index 7d4ea24d5502..3958fccae936 100644
--- a/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py
+++ b/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
 import torch
 
 from diffusers import AutoencoderKLTemporalDecoder
@@ -63,7 +64,16 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestAutoencoderKLTemporalDecoder(AutoencoderKLTemporalDecoderTesterConfig, ModelTesterMixin):
-    pass
+    @pytest.mark.skipif(
+        torch_device not in ["cuda", "xpu"],
+        reason="float16 and bfloat16 can only be use for inference with an accelerator",
+    )
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # fp16/bf16 convolutions are nondeterministic across the two model instances, so relax the tolerance.
+        super().test_from_save_pretrained_dtype_inference(
+            tmp_path, dtype, atol=3e-2 if dtype == torch.bfloat16 else 1e-2
+        )
 
 
 class TestAutoencoderKLTemporalDecoderTraining(AutoencoderKLTemporalDecoderTesterConfig, TrainingTesterMixin):
diff --git a/tests/models/autoencoders/test_models_consistency_decoder_vae.py b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
index 2220da59c77d..906baa60a9dc 100644
--- a/tests/models/autoencoders/test_models_consistency_decoder_vae.py
+++ b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
@@ -16,7 +16,6 @@
 import gc
 
 import numpy as np
-import pytest
 import torch
 
 from diffusers import ConsistencyDecoderVAE, StableDiffusionPipeline
@@ -87,13 +86,7 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestConsistencyDecoderVAE(ConsistencyDecoderVAETesterConfig, ModelTesterMixin):
-    @pytest.mark.skip(
-        "`forward` decodes through an iterative, RNG-driven consistency-decoding loop whose output is not "
-        "reproducible across two model instances and amplifies fp16/bf16 nondeterminism, so a low-precision "
-        "output-equivalence check is not meaningful."
-    )
-    def test_from_save_pretrained_dtype_inference(self):
-        pass
+    pass
 
 
 class TestConsistencyDecoderVAETraining(ConsistencyDecoderVAETesterConfig, TrainingTesterMixin):
diff --git a/tests/models/testing_utils/common.py b/tests/models/testing_utils/common.py
index c468d19ad2a1..626f1eb7f1bf 100644
--- a/tests/models/testing_utils/common.py
+++ b/tests/models/testing_utils/common.py
@@ -480,11 +480,7 @@ def test_keep_in_fp32_modules(self, tmp_path):
     )
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
     @torch.no_grad()
-    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
-        # Low-precision inference is inherently lossy, and models that keep some modules in fp32 diverge further from
-        # the fully-cast reference. Tolerances reflect the dtype's precision rather than a tight fp32-style threshold.
-        atol = 3e-2 if dtype == torch.bfloat16 else 1e-2
-        rtol = 0
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype, atol=1e-4, rtol=0):
         model = self.model_class(**self.get_init_dict())
         model.to(torch_device)
         fp32_modules = model._keep_in_fp32_modules or []
@@ -500,9 +496,12 @@ def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
             else:
                 assert param.data.dtype == dtype
 
-        inputs = cast_inputs_to_dtype(self.get_dummy_inputs(), torch.float32, dtype)
-        output = model(**inputs, return_dict=False)[0]
-        output_loaded = model_loaded(**inputs, return_dict=False)[0]
+        # Fetch inputs separately for each forward so that models consuming a generator (e.g. stochastic decoders)
+        # see the same, freshly-seeded RNG state in both passes instead of sharing a single advancing generator.
+        output = model(**cast_inputs_to_dtype(self.get_dummy_inputs(), torch.float32, dtype), return_dict=False)[0]
+        output_loaded = model_loaded(
+            **cast_inputs_to_dtype(self.get_dummy_inputs(), torch.float32, dtype), return_dict=False
+        )[0]
 
         assert_tensors_close(
             output, output_loaded, atol=atol, rtol=rtol, msg=f"Loaded model output differs for {dtype}"
diff --git a/tests/models/transformers/test_models_transformer_chronoedit.py b/tests/models/transformers/test_models_transformer_chronoedit.py
index 29fd99b82f7a..8baca5091b98 100644
--- a/tests/models/transformers/test_models_transformer_chronoedit.py
+++ b/tests/models/transformers/test_models_transformer_chronoedit.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
 import torch
 
 from diffusers import ChronoEditTransformer3DModel
@@ -92,7 +93,16 @@ def get_dummy_inputs(self, batch_size: int = 1) -> dict[str, torch.Tensor]:
 
 
 class TestChronoEditTransformer(ChronoEditTransformerTesterConfig, ModelTesterMixin):
-    pass
+    @pytest.mark.skipif(
+        torch_device not in ["cuda", "xpu"],
+        reason="float16 and bfloat16 can only be use for inference with an accelerator",
+    )
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # Modules kept in fp32 diverge from the fully-cast reference, so relax the low-precision tolerance.
+        super().test_from_save_pretrained_dtype_inference(
+            tmp_path, dtype, atol=3e-2 if dtype == torch.bfloat16 else 1e-2
+        )
 
 
 class TestChronoEditTransformerTraining(ChronoEditTransformerTesterConfig, TrainingTesterMixin):
diff --git a/tests/models/transformers/test_models_transformer_skyreels_v2.py b/tests/models/transformers/test_models_transformer_skyreels_v2.py
index 96a43d6f8209..0b895ef799dc 100644
--- a/tests/models/transformers/test_models_transformer_skyreels_v2.py
+++ b/tests/models/transformers/test_models_transformer_skyreels_v2.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
 import torch
 
 from diffusers import SkyReelsV2Transformer3DModel
@@ -87,7 +88,16 @@ def get_dummy_inputs(self, batch_size: int = 1) -> dict[str, torch.Tensor]:
 
 
 class TestSkyReelsV2Transformer(SkyReelsV2TransformerTesterConfig, ModelTesterMixin):
-    pass
+    @pytest.mark.skipif(
+        torch_device not in ["cuda", "xpu"],
+        reason="float16 and bfloat16 can only be use for inference with an accelerator",
+    )
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # Modules kept in fp32 diverge from the fully-cast reference, so relax the low-precision tolerance.
+        super().test_from_save_pretrained_dtype_inference(
+            tmp_path, dtype, atol=3e-2 if dtype == torch.bfloat16 else 1e-2
+        )
 
 
 class TestSkyReelsV2TransformerTraining(SkyReelsV2TransformerTesterConfig, TrainingTesterMixin):
diff --git a/tests/models/unets/test_models_unet_2d.py b/tests/models/unets/test_models_unet_2d.py
index a5cd8abd873a..0399f4301214 100644
--- a/tests/models/unets/test_models_unet_2d.py
+++ b/tests/models/unets/test_models_unet_2d.py
@@ -74,6 +74,17 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestUnet2DModel(Unet2DModelTesterConfig, ModelTesterMixin):
+    @pytest.mark.skipif(
+        torch_device not in ["cuda", "xpu"],
+        reason="float16 and bfloat16 can only be use for inference with an accelerator",
+    )
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # fp16/bf16 convolutions are nondeterministic across the two model instances, so relax the tolerance.
+        super().test_from_save_pretrained_dtype_inference(
+            tmp_path, dtype, atol=3e-2 if dtype == torch.bfloat16 else 1e-2
+        )
+
     def test_mid_block_attn_groups(self):
         init_dict = self.get_init_dict()
         init_dict["add_attention"] = True

From facce323322409c7cc59721dd57a5ff91fd4d033 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Thu, 18 Jun 2026 10:00:35 +0000
Subject: [PATCH 7/7] address reviewer feedback

---
 .../test_models_transformer_z_image.py        | 90 -------------------
 1 file changed, 90 deletions(-)

diff --git a/tests/models/transformers/test_models_transformer_z_image.py b/tests/models/transformers/test_models_transformer_z_image.py
index 67a8fde0f411..ad4a081557c5 100644
--- a/tests/models/transformers/test_models_transformer_z_image.py
+++ b/tests/models/transformers/test_models_transformer_z_image.py
@@ -131,100 +131,10 @@ def test_determinism(self, atol=1e-5, rtol=0):
             first[mask], second[mask], atol=atol, rtol=rtol, msg="Model outputs are not deterministic"
         )
 
-    def test_from_save_pretrained(self, tmp_path, atol=5e-5, rtol=5e-5):
-        torch.manual_seed(0)
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
-
-        model.save_pretrained(tmp_path)
-        new_model = self.model_class.from_pretrained(tmp_path)
-        new_model.to(torch_device)
-
-        for param_name in model.state_dict().keys():
-            param_1 = model.state_dict()[param_name]
-            param_2 = new_model.state_dict()[param_name]
-            assert param_1.shape == param_2.shape
-
-        inputs_dict = self.get_dummy_inputs()
-        image = _concat_list_output(model(**inputs_dict, return_dict=False)[0])
-        new_image = _concat_list_output(new_model(**inputs_dict, return_dict=False)[0])
-
-        assert_tensors_close(image, new_image, atol=atol, rtol=rtol, msg="Models give different forward passes.")
-
-    @torch.no_grad()
-    def test_from_save_pretrained_variant(self, tmp_path, atol=5e-5, rtol=0):
-        model = self.model_class(**self.get_init_dict())
-        model.to(torch_device)
-        model.eval()
-
-        model.save_pretrained(tmp_path, variant="fp16")
-        new_model = self.model_class.from_pretrained(tmp_path, variant="fp16")
-
-        with pytest.raises(OSError) as exc_info:
-            self.model_class.from_pretrained(tmp_path)
-
-        assert "Error no file named diffusion_pytorch_model.bin found in directory" in str(exc_info.value)
-
-        new_model.to(torch_device)
-
-        inputs_dict = self.get_dummy_inputs()
-        image = _concat_list_output(model(**inputs_dict, return_dict=False)[0])
-        new_image = _concat_list_output(new_model(**inputs_dict, return_dict=False)[0])
-
-        assert_tensors_close(image, new_image, atol=atol, rtol=rtol, msg="Models give different forward passes.")
-
     @pytest.mark.skip("Model output `sample` is a list of tensors, not a single tensor.")
     def test_outputs_equivalence(self, atol=1e-5, rtol=0):
         pass
 
-    def test_sharded_checkpoints_with_parallel_loading(self, tmp_path, atol=1e-5, rtol=0):
-        from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME, constants
-
-        from ..testing_utils.common import calculate_expected_num_shards, compute_module_persistent_sizes
-
-        torch.manual_seed(0)
-        config = self.get_init_dict()
-        inputs_dict = self.get_dummy_inputs()
-        model = self.model_class(**config).eval()
-        model = model.to(torch_device)
-
-        base_output = _concat_list_output(model(**inputs_dict, return_dict=False)[0])
-
-        model_size = compute_module_persistent_sizes(model)[""]
-        max_shard_size = int((model_size * 0.75) / (2**10))
-
-        original_parallel_loading = constants.HF_ENABLE_PARALLEL_LOADING
-        original_parallel_workers = getattr(constants, "HF_PARALLEL_WORKERS", None)
-
-        try:
-            model.cpu().save_pretrained(tmp_path, max_shard_size=f"{max_shard_size}KB")
-            assert os.path.exists(os.path.join(tmp_path, SAFE_WEIGHTS_INDEX_NAME))
-
-            expected_num_shards = calculate_expected_num_shards(os.path.join(tmp_path, SAFE_WEIGHTS_INDEX_NAME))
-            actual_num_shards = len([file for file in os.listdir(tmp_path) if file.endswith(".safetensors")])
-            assert actual_num_shards == expected_num_shards
-
-            constants.HF_ENABLE_PARALLEL_LOADING = False
-            self.model_class.from_pretrained(tmp_path).eval().to(torch_device)
-
-            constants.HF_ENABLE_PARALLEL_LOADING = True
-            constants.DEFAULT_HF_PARALLEL_LOADING_WORKERS = 2
-
-            torch.manual_seed(0)
-            model_parallel = self.model_class.from_pretrained(tmp_path).eval()
-            model_parallel = model_parallel.to(torch_device)
-
-            output_parallel = _concat_list_output(model_parallel(**inputs_dict, return_dict=False)[0])
-
-            assert_tensors_close(
-                base_output, output_parallel, atol=atol, rtol=rtol, msg="Output should match with parallel loading"
-            )
-        finally:
-            constants.HF_ENABLE_PARALLEL_LOADING = original_parallel_loading
-            if original_parallel_workers is not None:
-                constants.HF_PARALLEL_WORKERS = original_parallel_workers
-
 
 class TestZImageTransformerMemory(ZImageTransformerTesterConfig, MemoryTesterMixin):
     """Memory optimization tests for Z-Image Transformer."""