diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index 4476a403540..78053c33e7a 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -415,14 +415,40 @@
 if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
   # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
   echo "::group::Export"
+  EXPORT_LOG=$(mktemp)
   TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
     python -m executorch.examples.models.qwen3_5_moe.export \
       --prequantized "$LOCAL_MODEL_DIR" \
       --output-dir "${OUTPUT_DIR}" \
       --dense-prefill dequant \
-      --moe-activation-dtype int8
+      --moe-activation-dtype int8 2>&1 | tee "$EXPORT_LOG"
+  EXPORT_RC=${PIPESTATUS[0]}
   echo "::endgroup::"
 
+  if [ "$EXPORT_RC" -ne 0 ]; then
+    echo "ERROR: Qwen3.5 MoE export failed (exit $EXPORT_RC)"
+    rm -f "$EXPORT_LOG"
+    exit "$EXPORT_RC"
+  fi
+
+  # Gate peak GPU memory so we keep the export viable on consumer GPUs
+  # (e.g. RTX 4090 with 24 GB). The export script prints a machine-
+  # parseable marker line "EXPORT_GPU_PEAK_MEMORY_MB: <value>".
+  EXPORT_GPU_PEAK_MB_LIMIT="${EXPORT_GPU_PEAK_MB_LIMIT:-20480}"
+  PEAK_LINE=$(grep -E '^EXPORT_GPU_PEAK_MEMORY_MB:' "$EXPORT_LOG" | tail -1)
+  rm -f "$EXPORT_LOG"
+  if [ -z "$PEAK_LINE" ]; then
+    echo "ERROR: export did not emit EXPORT_GPU_PEAK_MEMORY_MB marker; cannot enforce GPU memory budget"
+    exit 1
+  fi
+  PEAK_MB=$(echo "$PEAK_LINE" | awk '{print $2}')
+  echo "Export GPU peak memory: ${PEAK_MB} MB (limit ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
+  if awk -v p="$PEAK_MB" -v l="$EXPORT_GPU_PEAK_MB_LIMIT" 'BEGIN{exit !(p>l)}'; then
+    echo "ERROR: export exceeded GPU memory budget (${PEAK_MB} MB > ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
+    echo "  — this would prevent the model from being exported on a 24 GB consumer GPU."
+    exit 1
+  fi
+
   test -f "${OUTPUT_DIR}/model.pte"
   test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
   ls -al "${OUTPUT_DIR}"
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index 9fe373f1be5..a478b43cf0f 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -9,7 +9,7 @@
 import typing
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import Any, Dict, List, Set
+from typing import Any, Dict, List, Optional, Set
 
 import torch
 from executorch.backends.aoti.passes.replace_view_copy_with_view import (
@@ -88,8 +88,14 @@ def save_data_externally(cls) -> bool:
         return False
 
     @classmethod
-    def get_extra_aoti_compile_context_manager(cls):
-        """Return extra context manager to apply during aoti_compile stage. By default returns an empty context manager."""
+    def get_extra_aoti_compile_context_manager(
+        cls, compile_specs: Optional[List[CompileSpec]] = None
+    ):
+        """Return extra context manager to apply during aoti_compile stage. By default returns an empty context manager.
+
+        Subclasses may inspect ``compile_specs`` to opt into behaviors that
+        only apply to specific methods/models (e.g. low-memory export).
+        """
         return contextlib.nullcontext()
 
     @classmethod
@@ -105,6 +111,24 @@ def codesign_so(cls, so_path: str, compile_specs: List[CompileSpec]) -> None:
         """
         return
 
+    @classmethod
+    def release_moved_tensors(
+        cls,
+        device_edge_program: ExportedProgram,
+        compile_specs: List[CompileSpec],
+    ) -> None:
+        """Release device memory held by tensors that ``move_to_device_pass``
+        placed on the target device.
+
+        Called at the end of ``preprocess`` so that the next ``preprocess``
+        call (e.g. for the next method in a multi-method export) can reuse
+        the freed memory. Override in concrete backends (e.g. ``CudaBackend``)
+        to actually free device memory.
+
+        Default: no-op.
+        """
+        return
+
     @classmethod
     @contextlib.contextmanager
     def collect_unsupported_fallback_kernels(cls, missing_fallback_kernels: Set[str]):
@@ -208,7 +232,7 @@ def preprocess(
         # Compile with fallback kernel collection
         with cls.collect_unsupported_fallback_kernels(
             missing_fallback_kernels
-        ), torch.no_grad(), cls.get_extra_aoti_compile_context_manager():
+        ), torch.no_grad(), cls.get_extra_aoti_compile_context_manager(compile_specs):
             paths = torch._inductor.aot_compile(
                 edge_program_module, tuple(user_input_placeholders), options=options
             )
@@ -269,6 +293,12 @@
         os.remove(so_path)
         os.remove(blob_path)
 
+        # Release device memory held by tensors that ``move_to_device_pass``
+        # placed on the target device. Default impl is a no-op; concrete
+        # backends (e.g. CudaBackend) override this to free GPU memory before
+        # the next preprocess call (e.g. for the next method).
+        cls.release_moved_tensors(device_edge_program, compile_specs)
+
         return PreprocessResult(
             processed_bytes=b"",
             debug_handle_map={},
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 5c6395c8b5b..a3169680b6d 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -5,9 +5,11 @@
 # LICENSE file in the root directory of this source tree.
 
+import contextlib
 import logging
 import os
 import shutil
+import threading
 import typing
 from importlib import resources
 from typing import Any, Dict, final, List, Optional
 
@@ -27,6 +29,83 @@
 from torch.nn.attention import SDPBackend
 
 
+# ---------------------------------------------------------------------------
+# AOTI compile-time CPU clones for mutated buffers
+# ---------------------------------------------------------------------------
+#
+# Inductor's `_unlift_graph` clones every mutated buffer that gets lifted into
+# the AOTI graph. By default it clones on whatever device the original tensor
+# lives on — which after `move_to_device_pass` is CUDA. For large models like
+# Qwen3.5-MoE that means an extra ~18 GB GPU clone during compile, blowing past
+# the 24 GB cap we want to honor for consumer GPUs (RTX 4090 and similar).
+#
+# The patch below side-steps that by:
+#   1. Wrapping `torch._inductor.compile_fx.clone_preserve_strides` so every
+#      clone the AOTI compile pipeline produces lands on CPU.
+#   2. Wrapping `CppWrapperCpu.codegen_device` so the C++ wrapper still records
+#      the model's original target device (e.g. cuda) in `constants_info_`,
+#      not the now-CPU storage device. Without this the runtime would refuse
+#      to load the constants because of a mixed-device mismatch.
+#
+# The wrappers are scoped via a thread-local guard and are only active while
+# `_compile_time_cpu_clones(...)` is on the call stack — they are inert
+# anywhere else in the process.
+
+_CPU_CLONE_GUARD = threading.local()
+
+
+def _is_cpu_clone_active() -> bool:
+    return getattr(_CPU_CLONE_GUARD, "active", False)
+
+
+@contextlib.contextmanager
+def _compile_time_cpu_clones(target_device: torch.device):
+    """Force AOTI's mutated-buffer clones onto CPU while preserving the
+    serialized constants' target device."""
+    from torch._inductor import compile_fx as _cfx
+    from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu as _Cpp
+
+    orig_clone = _cfx.clone_preserve_strides
+    orig_codegen_device = _Cpp.codegen_device
+
+    def _cpu_clone_preserve_strides(x: torch.Tensor) -> torch.Tensor:
+        # `clone_preserve_strides` is shared by `_unlift_graph` (clones
+        # lifted buffers — can be safely kept on CPU) and by autotuning code
+        # in `triton_heuristics.py` (clones for benchmark — must stay on
+        # GPU for Triton). Discriminate by caller frame so we only force
+        # CPU clones for the buffer-lifting path.
+        import sys
+
+        caller = sys._getframe(1).f_code.co_name
+        if caller == "_unlift_graph":
+            return orig_clone(x).cpu()
+        return orig_clone(x)
+
+    def _codegen_device_target_aware(self, device):
+        # Translate accidental CPU device strings back to the model target
+        # device only when a constant we forced to CPU is being serialized.
+        # Other code paths (extern op args etc.) are pass-through.
+        if (
+            _is_cpu_clone_active()
+            and self.device != "cpu"
+            and isinstance(device, torch.device)
+            and device.type == "cpu"
+        ):
+            device = target_device
+        return orig_codegen_device(self, device)
+
+    _cfx.clone_preserve_strides = _cpu_clone_preserve_strides
+    _Cpp.codegen_device = _codegen_device_target_aware
+    prev_active = getattr(_CPU_CLONE_GUARD, "active", False)
+    _CPU_CLONE_GUARD.active = True
+    try:
+        yield
+    finally:
+        _CPU_CLONE_GUARD.active = prev_active
+        _cfx.clone_preserve_strides = orig_clone
+        _Cpp.codegen_device = orig_codegen_device
+
+
 @final
 @experimental(
     "This API and all of cuda backend related functionality are experimental."
@@ -253,19 +332,97 @@ def get_aoti_compile_options(
         return options
 
     @classmethod
-    def get_extra_aoti_compile_context_manager(cls):
+    def get_extra_aoti_compile_context_manager(
+        cls, compile_specs: Optional[List[CompileSpec]] = None
+    ):
         """
-        Return SDPA MATH backend context manager for CUDA compilation.
-
-        This context manager plays as a fallback solution for any remaining PyTorch SDPA
-        operations to use the MATH backend (decomposed SDPA) during AOTInductor compilation.
-
-        Note:
-        - If SDPA ops are replaced with Triton kernels by ReplaceEdgeOpWithTritonOpPass,
-          this context manager will have no effect on those ops (they are no longer
-          PyTorch SDPA ops).
-        - If SDPA ops are NOT replaced (e.g., when triton_kernel_mode="OFF"), this
-          context manager will force them to use the MATH backend, causing them to
-          be automatically decomposed during compilation.
+        Combine all extra context managers needed during AOTInductor
+        compilation for the CUDA backend. Each manager is documented at
+        its own `enter_context` call site below.
+
+        The low-memory export monkey-patch (CPU clones for mutated buffers)
+        is gated on the ``low_memory_mode`` compile spec — only models that
+        explicitly opt in (currently Qwen3.5 MoE) get it. Other models go
+        through the unmodified AOTI codepath, which avoids regressions in
+        their cuda CI exports.
+        """
+        # Parse compile_specs for low_memory_mode (default OFF). compile_specs
+        # may be None when called without specs (parity with base default).
+        low_memory_mode = "OFF"
+        for spec in compile_specs or []:
+            if spec.key == "low_memory_mode":
+                mode = spec.value.decode("utf-8").upper()
+                if mode not in ["ON", "OFF"]:
+                    raise ValueError(
+                        f"Invalid low_memory_mode: {mode}. Expected 'ON' or 'OFF'."
+                    )
+                low_memory_mode = mode
+
+        @contextlib.contextmanager
+        def _combined():
+            with contextlib.ExitStack() as stack:
+                # Force any remaining PyTorch SDPA ops to use the MATH
+                # backend during compilation so AOTI can lower / decompose
+                # them. SDPA ops already replaced by Triton kernels via
+                # `ReplaceEdgeOpWithTritonOpPass` are unaffected; this is
+                # only the fallback for the `triton_kernel_mode="OFF"` path.
+                stack.enter_context(torch.nn.attention.sdpa_kernel([SDPBackend.MATH]))
+                if low_memory_mode == "ON":
+                    # Force AOTI's mutated-buffer clones onto CPU during
+                    # compile so we stay under tight GPU memory caps (e.g.
+                    # 24 GB on a consumer 4090). See
+                    # `_compile_time_cpu_clones` for details. Only enabled
+                    # for models that explicitly opt in via the
+                    # `low_memory_mode="ON"` compile spec, since the
+                    # monkey-patch can interact poorly with other models'
+                    # AOTI compile pipelines.
+                    stack.enter_context(
+                        _compile_time_cpu_clones(torch.device(cls.get_device_name()))
+                    )
+                yield
+
+        return _combined()
+
+    @staticmethod
+    def _is_low_memory_mode(compile_specs: List[CompileSpec]) -> bool:
+        """Return True if any compile spec opts into low-memory export."""
+        for spec in compile_specs:
+            if spec.key == "low_memory_mode":
+                return spec.value.decode("utf-8").upper() == "ON"
+        return False
+
+    @classmethod
+    def release_moved_tensors(
+        cls,
+        device_edge_program,
+        compile_specs: List[CompileSpec],
+    ) -> None:
+        """
+        Free GPU memory held by tensors that ``move_to_device_pass`` placed
+        on CUDA (params, buffers, and constants of ``device_edge_program``).
+
+        Resizing the underlying storage to 0 returns those bytes to PyTorch's
+        caching allocator, so the next ``preprocess`` call (e.g. for the
+        next method in a multi-method export) can reuse them when its own
+        ``move_to_device_pass`` runs.
         """
-        return torch.nn.attention.sdpa_kernel([SDPBackend.MATH])
+        if not torch.cuda.is_available():
+            return
+
+        pools = []
+        state_dict = getattr(device_edge_program, "state_dict", None)
+        if state_dict:
+            pools.append(state_dict.values())
+        constants = getattr(device_edge_program, "constants", None)
+        if constants:
+            pools.append(constants.values())
+
+        for pool in pools:
+            for tensor in pool:
+                if isinstance(tensor, torch.Tensor) and tensor.is_cuda:
+                    try:
+                        tensor.untyped_storage().resize_(0)
+                    except Exception:
+                        # Some storages may be shared / non-resizable; skip
+                        # them rather than failing the export.
+                        pass
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
index 376c233da8b..ed787b3c110 100644
--- a/examples/models/qwen3_5_moe/export.py
+++ b/examples/models/qwen3_5_moe/export.py
@@ -934,6 +934,7 @@ def _export_cuda(model, config, args):
         ExecutorchBackendConfig,
         to_edge_transform_and_lower,
     )
+    from executorch.exir.backend.compile_spec_schema import CompileSpec
     from executorch.exir.passes import MemoryPlanningPass
     from torch.export import Dim, export
 
@@ -1007,6 +1008,7 @@ def _export_cuda(model, config, args):
             CudaPartitioner(
                 [
                     CudaBackend.generate_method_name_compile_spec("decode"),
+                    CompileSpec("low_memory_mode", b"ON"),
                 ]
             )
         ],
@@ -1014,6 +1016,7 @@ def _export_cuda(model, config, args):
             CudaPartitioner(
                 [
                     CudaBackend.generate_method_name_compile_spec("prefill"),
+                    CompileSpec("low_memory_mode", b"ON"),
                 ]
            )
         ],
@@ -1166,6 +1169,13 @@ def main():  # noqa: C901
         # Register FLA Triton kernel (CUDA only)
         import executorch.backends.cuda.triton.kernels  # noqa: F401
 
+    # Reset peak GPU memory stats so we can report the actual peak
+    # consumed during the export pipeline (load + quantize + lowering)
+    # at the very end. CI also gates on this value to make sure low-VRAM
+    # GPUs (e.g. RTX 4090, 24 GB) can still complete the export.
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats(0)
+
     if args.backend == "mlx":
         if args.prequantized:
             parser.error("--prequantized is not supported with --backend mlx")
@@ -1207,6 +1217,13 @@ def main():  # noqa: C901
 
     export_and_lower(model, config, args)
 
+    # Report peak GPU memory consumed during the export so CI / users can
+    # gate this against a known budget (e.g. 24 GB consumer GPUs).
+    if args.backend == "cuda" and torch.cuda.is_available():
+        peak_mb = torch.cuda.max_memory_allocated(0) / (1024 * 1024)
+        # Stable, machine-parseable marker for CI grep.
+        print(f"EXPORT_GPU_PEAK_MEMORY_MB: {peak_mb:.2f}")
+
 
 if __name__ == "__main__":
     main()
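
Note (editor's sketch, not part of the patch): the low-memory path above is opt-in per partitioner via the `low_memory_mode` compile spec. A minimal sketch of how another CUDA export could opt in is shown below. The import paths for `CudaBackend` and `CudaPartitioner` are assumed from the file layout in this diff, and "decode" stands in for whatever method the export lowers.

    # Sketch only: opt a CUDA-lowered method into the low-memory compile path.
    from executorch.backends.cuda.cuda_backend import CudaBackend          # assumed path
    from executorch.backends.cuda.cuda_partitioner import CudaPartitioner  # assumed path
    from executorch.exir.backend.compile_spec_schema import CompileSpec

    partitioner = CudaPartitioner(
        [
            # Name the method being lowered, as in the qwen3_5_moe export above.
            CudaBackend.generate_method_name_compile_spec("decode"),
            # Opt in: during aoti_compile, AOTI's mutated-buffer clones land on
            # CPU instead of the GPU (see _compile_time_cpu_clones above).
            CompileSpec("low_memory_mode", b"ON"),
        ]
    )

The spec is per-partitioner, so in a multi-method export each method that needs the reduced compile-time GPU footprint must carry its own `CompileSpec("low_memory_mode", b"ON")`, exactly as the decode and prefill partitioners do in export.py above.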