From 04c5b85132afe3db56fefc365e5fad49a29d0822 Mon Sep 17 00:00:00 2001
From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 15 May 2026 00:43:39 -0700
Subject: [PATCH 1/7] Layerwise MoE calibration infra without the per-expert
 amax cliff
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Brings over from origin/fridah/glm5.1-tmp:
- nvfp4_tensor.py: FP8 underflow guard before float8_e4m3fn cast (clamp
  min=2**-9). Kept main's clamp(max=448) too, so we have both guards.
- layerwise_calib.py: CPU offload of captured inputs, lazy device-move on
  replay, resume detection when all layers are already complete, separate
  quantizer_amaxes.pt per layer for fast amax-only restore, new
  restore_weights=False parameter on full_restore (skips reloading 2+ TB
  of unchanged expert weights at export time).
- tensor_quantizer.py: NVFP4 static-quantizer dispatch in forward, for
  per-expert quantizers whose _amax is set by MSE after max_calibrate
  (experts not routed during max_calibrate stay as plain TensorQuantizer
  and need this dispatch). Kept is_nvfp4_static property — main uses it
  in model_calib.py, conversion.py, calib_utils.py, core_utils.py.
- model_calib.py: MO_DEBUG_MAX_LAYERS env var as a debug escape hatch that
  limits layerwise calibration to the first N layers (smoke-testing only).

Does NOT bring over the moe_utils.py changes from c273ddb8 — those added
an over-aggressive _min_valid_amax=2e-3 invalidity threshold +
clamp(min=2e-3) on the fallback path which floored the effective
per-block weight scale at 2e-3/6 ~= 3.3e-4 and produced the cliff seen
in glm-5.1-nvfp4-MSE-expert-only-7ds-0509. Main's existing moe_utils.py
(post #1340, #1421) handles uncalibrated experts gently via None /
torch.all(_amax == 0) without any magnitude floor.

Does NOT bring the per-expert MSE discovery hunks from cfe4a4aa — main's
version of that logic is kept instead.

unified_export_hf.py and moe_utils.py are intentionally kept at main:
main already has both _disable_use_cache and _sanitize_generation_config
helpers; pulling glm5.1-tmp's older version of these files would have
regressed those features.
---
 modelopt/torch/quantization/model_calib.py    |   9 ++
 .../nn/modules/tensor_quantizer.py            |  25 +++++
 .../quantization/utils/layerwise_calib.py     | 101 ++++++++++++++----
 3 files changed, 114 insertions(+), 21 deletions(-)

diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
index 78b237847b1..feab954059b 100644
--- a/modelopt/torch/quantization/model_calib.py
+++ b/modelopt/torch/quantization/model_calib.py
@@ -16,6 +16,7 @@
 """Calibration utilities."""
 
 import math
+import os
 import time
 import warnings
 from collections.abc import Callable
@@ -1766,7 +1767,15 @@ def layerwise_calibrate(
             start_layer, resumed_inputs, forward_loop
         )
 
+        _debug_max_layers = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0")
+
         for layer_idx in range(start_layer, num_layers):
+            if _debug_max_layers > 0 and layer_idx >= _debug_max_layers:
+                print_rank_0(
+                    f"MO_DEBUG_MAX_LAYERS={_debug_max_layers}: stopping layerwise "
+                    f"calibration after layer {layer_idx - 1}/{num_layers}"
+                )
+                break
             layer = transformer_layers[layer_idx]
 
             def _layer_forward_loop(m, _inputs=layer_inputs):
diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
index 5e3cea44c2a..cda08964fbd 100644
--- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
+++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
@@ -824,6 +824,31 @@ def _fake_quantize(self, inputs):
                 getattr(self, "_onnx_quantizer_type", None),
                 self._pass_through_bwd,
             )
+        elif (
+            self.block_sizes is not None
+            and self._num_bits == (2, 1)
+            and self.block_sizes.get("scale_bits") == (4, 3)
+        ):
+            # Static NVFP4: plain TensorQuantizer should have been promoted to
+            # NVFP4StaticQuantizer during MSE setup. For per-expert quantizers
+            # in fused MoEs, promotion is gated on `_amax` having been set during
+            # max_calibrate; experts not activated during max_calibrate stay
+            # plain. MSE later sets a per-block `_amax`, so by the time forward
+            # runs again the quantizer has a valid amax — dispatch to the static
+            # NVFP4 fake-quant path here.
+            if amax is not None:
+                outputs = static_blockwise_fp4_fake_quant(
+                    inputs,
+                    amax,
+                    None,  # global_amax — computed internally by the kernel
+                    True,
+                    inputs.dtype,
+                    self._pass_through_bwd,
+                )
+            else:
+                # No amax at all (truly uncalibrated): pass through unchanged so
+                # forward doesn't crash. Should not normally be reachable.
+                outputs = inputs
         elif isinstance(self._num_bits, tuple):
             # Float-point quantization, e.g., FP8
             E, M = self._num_bits  # noqa: N806
diff --git a/modelopt/torch/quantization/utils/layerwise_calib.py b/modelopt/torch/quantization/utils/layerwise_calib.py
index aed403ad87b..be139cafadb 100644
--- a/modelopt/torch/quantization/utils/layerwise_calib.py
+++ b/modelopt/torch/quantization/utils/layerwise_calib.py
@@ -227,12 +227,30 @@ def _patched_forward(self, *args, **kwargs):
                     f"Layer {info.name} is in 'run' mode but has no cached inputs to replay."
                 )
                 real_args, real_kwargs = info.cached_inputs.popleft()
+                if (
+                    real_args
+                    and isinstance(real_args[0], torch.Tensor)
+                    and real_args[0].device.type == "cpu"
+                ):
+                    device = get_module_device(self)
+                    real_args = _move_to_device(real_args, device)
+                    real_kwargs = _move_to_device(real_kwargs, device)
                 output = self._original_forward(*real_args, **real_kwargs)
                 info.output_meta = LayerActivationCollector._extract_output_meta(output)
                 return output
 
             if info.mode == "capture":
-                info.collected_inputs.append((args, kwargs))
+                # Offload captured inputs to CPU at append time. For early layers
+                # on a single GPU (e.g. layer 0–2 on GPU 0 with seq_device_map),
+                # accumulating thousands of batches' worth of (bs × seq × hidden)
+                # activations on-device saturates that GPU during the capture loop
+                # and OOMs before _set_layer_states gets a chance to move them.
+                # The "run" branch already handles CPU-resident inputs (see the
+                # device-check above), so storing on CPU is safe end-to-end.
+                cpu = torch.device("cpu")
+                info.collected_inputs.append(
+                    (_move_to_device(args, cpu), _move_to_device(kwargs, cpu))
+                )
                 raise _EarlyStopForwardError()
 
             return self._original_forward(*args, **kwargs)
@@ -315,7 +333,11 @@ def _set_layer_states(self, layer_idx: int):
                     "was called for every preceding layer in order."
                 )
             prev.mode = "run"
-            prev.cached_inputs = deque(prev.collected_inputs)
+            cpu = torch.device("cpu")
+            prev.cached_inputs = deque(
+                (_move_to_device(args, cpu), _move_to_device(kwargs, cpu))
+                for args, kwargs in prev.collected_inputs
+            )
             prev.collected_inputs = []
 
         cur = self._decoder_layers[layer_idx]._layerwise_calib
@@ -433,6 +455,10 @@ def cache_outputs_for_next_layer_calib(
 
         next_layer = self._decoder_layers[next_idx]
         with persistent_materialization(layer):
+            # Release PyTorch's cached-but-unused GPU memory before the forward pass.
+            # After MSE weight sweeps, the allocator holds freed blocks in its cache;
+            # empty_cache() returns them to CUDA so the capture forward pass has headroom.
+            torch.cuda.empty_cache()
             return self.get_input_activations(next_layer, forward_loop)
 
 
@@ -508,14 +534,19 @@ def _save_layer(
     torch.save(output_meta, os.path.join(d, "output_meta.pt"))
     if next_inputs is not None:
         torch.save(next_inputs, os.path.join(d, "next_inputs.pt"))
+    amax_state = {k: v for k, v in weights.items() if "_amax" in k}
+    if amax_state:
+        torch.save(amax_state, os.path.join(d, "quantizer_amaxes.pt"))
     _write_manifest(checkpoint_dir, idx, num_layers)
 
 
 def detect_resume_point(checkpoint_dir: str) -> tuple[int, dict] | None:
     """Detect where to resume from an existing checkpoint directory.
 
-    Returns ``(start_layer, manifest)`` if there is work to resume,
-    or ``None`` if the directory is empty, corrupt, or calibration was already complete.
+    Returns ``(start_layer, manifest)`` if there is a checkpoint, or ``None`` if
+    the directory is empty or corrupt.  When all layers are calibrated,
+    ``start_layer == num_layers``; ``layerwise_calibrate`` detects this and
+    early-returns after ``full_restore`` without re-running calibration.
     """
     manifest = _read_manifest(checkpoint_dir)
     if manifest is None:
@@ -524,8 +555,6 @@ def detect_resume_point(checkpoint_dir: str) -> tuple[int, dict] | None:
     total = manifest.get("num_layers")
     if last is None or total is None:
         return None
-    if last + 1 >= total:
-        return None
     return (last + 1, manifest)
 
 
@@ -568,7 +597,9 @@ def from_folder(cls, checkpoint_dir: str | None, num_layers: int) -> _Checkpoint
                     f"but model has {num_layers}. Use a fresh checkpoint directory."
                 )
         start = info[0] if info else 0
-        if start > 0:
+        if start >= num_layers:
+            print_rank_0(f"Checkpoint: all {num_layers} layers already calibrated")
+        elif start > 0:
             print_rank_0(
                 f"Checkpoint: resuming layerwise calibration from layer {start}/{num_layers}"
             )
@@ -601,12 +632,20 @@ def setup_resume(self, layers: nn.ModuleList) -> list | None:
             raise FileNotFoundError(f"Cannot resume: next_inputs.pt missing for layer {last_ckpt}")
         # weights_only=False is safe: file is internally generated by _save_layer, not user-supplied
         next_inputs = torch.load(next_inputs_path, map_location="cpu", weights_only=False)
-        resume_device = get_module_device(layers[self.start_layer])
-        next_inputs = _move_to_device(next_inputs, resume_device)
+        # Keep on CPU — _patched_forward's run mode moves each entry to device on pop.
         return next_inputs
 
-    def full_restore(self, layers: nn.ModuleList, model: nn.Module) -> None:
-        """Restore weights and quantizer state for layers 0..K-1 after the calibration loop."""
+    def full_restore(
+        self, layers: nn.ModuleList, model: nn.Module, restore_weights: bool = True
+    ) -> None:
+        """Restore weights and quantizer state for layers 0..K-1 after the calibration loop.
+
+        Args:
+            restore_weights: If False, skip reloading ``weights.pt`` and load only the
+                ``_amax`` values (from ``quantizer_amaxes.pt`` or filtered from ``weights.pt``).
+                Set to False for calibration algorithms (max, MSE) that never modify weights
+                to avoid re-reading gigabytes of unchanged expert weights from disk.
+        """
         from modelopt.torch.quantization.config import QuantizeConfig
         from modelopt.torch.quantization.conversion import restore_quantizer_state
         from modelopt.torch.quantization.utils.core_utils import enable_weight_access_and_writeback
@@ -620,23 +659,43 @@ def full_restore(self, layers: nn.ModuleList, model: nn.Module) -> None:
             layer = layers[i]
             d = _layer_dir(self.checkpoint_dir, i)
 
-            # Resolve layer_device and load inside the context so params are
-            # materialized — otherwise get_module_device can return meta.
+            # Load inside the context so params are materialized — otherwise
+            # get_module_device can return meta.
             with enable_weight_access_and_writeback(layer, model, name_to_module):
-                layer_device = get_module_device(layer)
+                # Load to CPU first — prevents CUDA tensors with non-zero storage_offset
+                # being reconstructed from serialized views, which causes illegal memory
+                # access when later cloned (e.g. inside deepcopy in _export_fused_experts).
                 # weights_only=False is safe: files are internally generated by _save_layer
                 qstate = torch.load(
                     os.path.join(d, "quantizer_state.pt"),
-                    map_location=layer_device,
-                    weights_only=False,
-                )
-                weights = torch.load(
-                    os.path.join(d, "weights.pt"),
-                    map_location=layer_device,
+                    map_location="cpu",
                     weights_only=False,
                 )
                 restore_quantizer_state(layer, dummy_config, {"quantizer_state": qstate})
-                layer.load_state_dict(weights, strict=False, assign=True)
+                if restore_weights:
+                    weights = torch.load(
+                        os.path.join(d, "weights.pt"),
+                        map_location="cpu",
+                        weights_only=False,
+                    )
+                    layer.load_state_dict(weights, strict=False, assign=False)
+                else:
+                    # Load only _amax entries — skip gigabytes of unchanged expert weights.
+                    # Use map_location="cpu" to get fresh CPU tensors (no storage_offset).
+                    # _export_fused_experts moves _amax to the weight device on demand.
+                    amax_path = os.path.join(d, "quantizer_amaxes.pt")
+                    if os.path.exists(amax_path):
+                        amaxes = torch.load(amax_path, map_location="cpu", weights_only=False)
+                    else:
+                        # Legacy checkpoint: filter _amax entries from the full weights.pt.
+                        weights = torch.load(
+                            os.path.join(d, "weights.pt"),
+                            map_location="cpu",
+                            weights_only=False,
+                        )
+                        amaxes = {k: v for k, v in weights.items() if "_amax" in k}
+                    if amaxes:
+                        layer.load_state_dict(amaxes, strict=False, assign=True)
 
         print_rank_0(f"Checkpoint: restored {self.start_layer} previously calibrated layers")
 

From f9803e4e8cba363f01ce2380b7ce29a59bdef999 Mon Sep 17 00:00:00 2001
From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 15 May 2026 01:46:17 -0700
Subject: [PATCH 2/7] moe_utils: safe CPU amax extract for export-time
 uncalibrated-expert path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds _safe_cpu_amax helper + null-then-deepcopy pattern to _export_fused_experts
to avoid cudaErrorIllegalAddress when the per-expert _amax came back from a
layerwise checkpoint as a CUDA tensor with non-zero storage offset / corrupt
storage. Pre-extracts amax to CPU with explicit synchronize() before any
torch.all() / deepcopy() touches it.

This is a strict subset of c273ddb8's moe_utils changes — the cliff-creating
_min_valid_amax=2e-3 invalidity threshold + clamp(min=2e-3) on the fallback
are deliberately NOT brought over. Uncalibrated experts still fall back to
weight_slice.abs().amax() without any magnitude floor, matching main's
existing semantics.

model_quant: ensure output_dir exists before writing .quant_summary.txt.
Fixes FileNotFoundError when print_quant_summary runs before the export
step creates the directory (FIXES Fix 4).
---
 modelopt/torch/export/moe_utils.py | 58 ++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py
index e325e5346f1..f6775359c84 100644
--- a/modelopt/torch/export/moe_utils.py
+++ b/modelopt/torch/export/moe_utils.py
@@ -59,9 +59,38 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
     # 2-3. Split + export each per-expert projection.
     fused_dim0 = gate_up.shape[1]  # 2 * expert_dim
 
+    def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
+        """Extract _amax to CPU float32, surfacing+clearing any pending CUDA error first.
+
+        Layerwise calibration's _save_layer + full_restore can leave the per-expert
+        ``_amax`` as a CUDA tensor reconstructed from a serialized view with non-zero
+        storage offset. Touching it directly (``torch.all`` / ``deepcopy``) then triggers
+        ``cudaErrorIllegalAddress``. Synchronizing first surfaces the latent error so the
+        subsequent ``.detach().cpu()`` either succeeds on a clean tensor or we fall through
+        to ``None`` and treat the expert as uncalibrated.
+        """
+        amax = getattr(quantizer_src, "_amax", None)
+        if amax is None or not isinstance(amax, torch.Tensor):
+            return None
+        try:
+            if amax.is_cuda:
+                torch.cuda.synchronize(amax.device)
+            return amax.detach().cpu().float()
+        except Exception:
+            return None
+
     for idx in range(n):
         expert = nn.Module()
 
+        # Pre-extract both per-expert amaxes to CPU *before* the projection loop's
+        # deepcopy. deepcopy calls .clone() on CUDA tensors — if the stored _amax
+        # has corrupt storage (under-calibrated experts after layerwise calib), the
+        # clone triggers an async CUDA illegal-memory-access error. Synchronizing in
+        # _safe_cpu_amax surfaces the error here so subsequent operations work on
+        # safe CPU float32 tensors.
+        gu_amax_cpu = _safe_cpu_amax(module.gate_up_proj_weight_quantizers[idx])
+        down_amax_cpu = _safe_cpu_amax(module.down_proj_weight_quantizers[idx])
+
         # If the gate_up source quantizer was never calibrated (rare expert
         # that received no calibration tokens), derive its amax once from the
         # FUSED tensor so gate and up share the same weight_scale_2 below.
@@ -72,11 +101,11 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
         # mismatched weight_scale_2 and garbled MoE output at inference.
         gate_up_q = module.gate_up_proj_weight_quantizers[idx]
         if getattr(gate_up_q, "is_enabled", False) and (
-            not hasattr(gate_up_q, "_amax")
-            or gate_up_q._amax is None
-            or torch.all(gate_up_q._amax == 0)
+            gu_amax_cpu is None or bool(torch.all(gu_amax_cpu == 0))
         ):
             gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32)
+            # Refresh the CPU amax we'll inject below.
+            gu_amax_cpu = _safe_cpu_amax(gate_up_q)
             warnings.warn(
                 f"Expert {idx} gate_up_proj weight quantizer was not calibrated "
                 f"(amax missing or zero). Using fused-tensor amax as fallback "
@@ -100,7 +129,23 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
             i_quantizer = gate_up_input_q if is_gate_up else down_input_q
 
             # gate/up share a weight quantizer — clone so each gets independent amax.
-            w_quantizer = copy.deepcopy(w_quantizer_src) if is_gate_up else w_quantizer_src
+            # Null _amax on source before deepcopy so the (possibly corrupt) CUDA tensor
+            # is never cloned; restore afterwards for the sibling projection. The CPU
+            # amax we pre-extracted gets injected in its place.
+            if is_gate_up:
+                _saved_amax = getattr(w_quantizer_src, "_amax", None)
+                try:
+                    w_quantizer_src._amax = None
+                    w_quantizer = copy.deepcopy(w_quantizer_src)
+                finally:
+                    w_quantizer_src._amax = _saved_amax
+                if gu_amax_cpu is not None:
+                    w_quantizer._amax = gu_amax_cpu
+            else:
+                w_quantizer = w_quantizer_src
+                if down_amax_cpu is not None:
+                    # Replace any CUDA-resident _amax with the safe CPU copy.
+                    w_quantizer._amax = down_amax_cpu
 
             # For per-channel amax (dim >= 1), proportionally slice dim-0
             # to match the split weight.
@@ -109,7 +154,7 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
                 and w_quantizer._amax is not None
                 and w_quantizer._amax.dim() >= 1
             ):
-                amax = w_quantizer._amax
+                amax = w_quantizer._amax  # already CPU float32 thanks to _safe_cpu_amax
                 # Per-block _amax (NVFP4 static) collapses the row axis we want
                 # to slice on; restore it so dim-0 slicing splits gate/up.
                 if amax.numel() != fused_total and amax.numel() % fused_total == 0:
@@ -132,13 +177,14 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
                     )
 
             # If the weight quantizer was never calibrated, compute amax from weights.
+            # All amax tests below operate on the safe CPU tensor injected above.
             if (
                 hasattr(w_quantizer, "is_enabled")
                 and w_quantizer.is_enabled
                 and (
                     not hasattr(w_quantizer, "_amax")
                     or w_quantizer._amax is None
-                    or torch.all(w_quantizer._amax == 0)
+                    or bool(torch.all(w_quantizer._amax == 0))
                 )
             ):
                 w_quantizer.amax = weight_slice.abs().amax().to(torch.float32)

From f349e5738915efc78df1b5da61c5bd81fbde4784 Mon Sep 17 00:00:00 2001
From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 15 May 2026 02:12:03 -0700
Subject: [PATCH 3/7] moe_utils: sync global_amax to same device as per-block
 _amax after CPU extract

---
 modelopt/torch/export/moe_utils.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py
index f6775359c84..0f5f96d92c7 100644
--- a/modelopt/torch/export/moe_utils.py
+++ b/modelopt/torch/export/moe_utils.py
@@ -195,6 +195,18 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
                     stacklevel=2,
                 )
 
+            # Recompute global_amax from the (CPU) per-block _amax so both live on
+            # the same device when get_weights_scaling_factor_from_quantizer
+            # does ``per_block_scale * 448 / per_block_scale_max``. Without this,
+            # a stale CUDA global_amax mismatches the CPU per-block amax we
+            # injected above. No magnitude floor — that's main's policy.
+            if (
+                hasattr(w_quantizer, "_amax")
+                and w_quantizer._amax is not None
+                and hasattr(w_quantizer, "global_amax")
+            ):
+                w_quantizer.global_amax = w_quantizer._amax.float().amax()
+
             wrapper = nn.Module()
             wrapper.weight = nn.Parameter(weight_slice.contiguous(), requires_grad=False)
             wrapper.weight_quantizer = w_quantizer

From 2be889a16384b7e6266ce587a118e9023427bb9d Mon Sep 17 00:00:00 2001
From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 15 May 2026 09:44:11 -0700
Subject: [PATCH 4/7] moe_utils: keep _amax on its native device when safe
 (avoid cuda/cpu mismatch with _global_amax)

---
 modelopt/torch/export/moe_utils.py | 42 ++++++++++++++++++------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py
index 0f5f96d92c7..797654222f8 100644
--- a/modelopt/torch/export/moe_utils.py
+++ b/modelopt/torch/export/moe_utils.py
@@ -59,15 +59,16 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
     # 2-3. Split + export each per-expert projection.
     fused_dim0 = gate_up.shape[1]  # 2 * expert_dim
 
-    def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
-        """Extract _amax to CPU float32, surfacing+clearing any pending CUDA error first.
+    def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
+        """Return _amax as a clean tensor, surfacing any latent CUDA error first.
 
         Layerwise calibration's _save_layer + full_restore can leave the per-expert
         ``_amax`` as a CUDA tensor reconstructed from a serialized view with non-zero
         storage offset. Touching it directly (``torch.all`` / ``deepcopy``) then triggers
-        ``cudaErrorIllegalAddress``. Synchronizing first surfaces the latent error so the
-        subsequent ``.detach().cpu()`` either succeeds on a clean tensor or we fall through
-        to ``None`` and treat the expert as uncalibrated.
+        ``cudaErrorIllegalAddress``. We synchronize first to surface any pending error,
+        then return the tensor on its original device. Falling back to CPU only on the
+        error path avoids creating a device mismatch with sibling buffers
+        (``_global_amax``) that stayed on the original device.
         """
         amax = getattr(quantizer_src, "_amax", None)
         if amax is None or not isinstance(amax, torch.Tensor):
@@ -75,9 +76,16 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
         try:
             if amax.is_cuda:
                 torch.cuda.synchronize(amax.device)
-            return amax.detach().cpu().float()
+            # Force a no-op read to trigger any latent async error.
+            _ = amax.shape
+            return amax.detach()
         except Exception:
-            return None
+            # CUDA tensor was unreadable. Try to recover a CPU copy; if that
+            # also fails, treat as uncalibrated.
+            try:
+                return amax.detach().cpu().float()
+            except Exception:
+                return None
 
     for idx in range(n):
         expert = nn.Module()
@@ -86,10 +94,10 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
         # deepcopy. deepcopy calls .clone() on CUDA tensors — if the stored _amax
         # has corrupt storage (under-calibrated experts after layerwise calib), the
         # clone triggers an async CUDA illegal-memory-access error. Synchronizing in
-        # _safe_cpu_amax surfaces the error here so subsequent operations work on
+        # _safe_amax surfaces the error here so subsequent operations work on
         # safe CPU float32 tensors.
-        gu_amax_cpu = _safe_cpu_amax(module.gate_up_proj_weight_quantizers[idx])
-        down_amax_cpu = _safe_cpu_amax(module.down_proj_weight_quantizers[idx])
+        gu_amax = _safe_amax(module.gate_up_proj_weight_quantizers[idx])
+        down_amax = _safe_amax(module.down_proj_weight_quantizers[idx])
 
         # If the gate_up source quantizer was never calibrated (rare expert
         # that received no calibration tokens), derive its amax once from the
@@ -101,11 +109,11 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
         # mismatched weight_scale_2 and garbled MoE output at inference.
         gate_up_q = module.gate_up_proj_weight_quantizers[idx]
         if getattr(gate_up_q, "is_enabled", False) and (
-            gu_amax_cpu is None or bool(torch.all(gu_amax_cpu == 0))
+            gu_amax is None or bool(torch.all(gu_amax == 0))
         ):
             gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32)
             # Refresh the CPU amax we'll inject below.
-            gu_amax_cpu = _safe_cpu_amax(gate_up_q)
+            gu_amax = _safe_amax(gate_up_q)
             warnings.warn(
                 f"Expert {idx} gate_up_proj weight quantizer was not calibrated "
                 f"(amax missing or zero). Using fused-tensor amax as fallback "
@@ -139,13 +147,13 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
                     w_quantizer = copy.deepcopy(w_quantizer_src)
                 finally:
                     w_quantizer_src._amax = _saved_amax
-                if gu_amax_cpu is not None:
-                    w_quantizer._amax = gu_amax_cpu
+                if gu_amax is not None:
+                    w_quantizer._amax = gu_amax
             else:
                 w_quantizer = w_quantizer_src
-                if down_amax_cpu is not None:
+                if down_amax is not None:
                     # Replace any CUDA-resident _amax with the safe CPU copy.
-                    w_quantizer._amax = down_amax_cpu
+                    w_quantizer._amax = down_amax
 
             # For per-channel amax (dim >= 1), proportionally slice dim-0
             # to match the split weight.
@@ -154,7 +162,7 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
                 and w_quantizer._amax is not None
                 and w_quantizer._amax.dim() >= 1
             ):
-                amax = w_quantizer._amax  # already CPU float32 thanks to _safe_cpu_amax
+                amax = w_quantizer._amax  # safe-extracted via _safe_amax (CUDA or CPU, recovered if corrupt)
                 # Per-block _amax (NVFP4 static) collapses the row axis we want
                 # to slice on; restore it so dim-0 slicing splits gate/up.
                 if amax.numel() != fused_total and amax.numel() % fused_total == 0:

From 59e9e7c9e82e8a700a15dc7d77b364af63e596a1 Mon Sep 17 00:00:00 2001
From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 15 May 2026 10:31:46 -0700
Subject: [PATCH 5/7] moe_utils: pin _amax + global_amax to the weight slice's
 device before export

The big-model run hit a cuda:0 vs cpu device mismatch in
get_weights_scaling_factor_from_quantizer (per_block_scale * 448 /
per_block_scale_max). Root cause: the big model is large enough that
device_map='sequential' offloads some params to CPU, so _amax and
global_amax can land on different devices after deepcopy + injection.

Pin both to weight_slice.device right before calling
_export_quantized_weight. No magnitude clamp.
---
 modelopt/torch/export/moe_utils.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py
index 797654222f8..7bc06d42ee4 100644
--- a/modelopt/torch/export/moe_utils.py
+++ b/modelopt/torch/export/moe_utils.py
@@ -203,17 +203,22 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
                     stacklevel=2,
                 )
 
-            # Recompute global_amax from the (CPU) per-block _amax so both live on
-            # the same device when get_weights_scaling_factor_from_quantizer
-            # does ``per_block_scale * 448 / per_block_scale_max``. Without this,
-            # a stale CUDA global_amax mismatches the CPU per-block amax we
-            # injected above. No magnitude floor — that's main's policy.
+            # Align _amax and global_amax with the weight slice's device. The
+            # export math ``per_block_scale * 448 / per_block_scale_max`` reads
+            # both from the quantizer and would otherwise error if they drifted
+            # apart (e.g., CPU-offloaded big-model layers + CUDA-resident weight
+            # slice, or our CPU-injected _amax + the original CUDA global_amax).
+            # No magnitude floor — that's main's policy for the uncalibrated
+            # fallback below.
             if (
                 hasattr(w_quantizer, "_amax")
                 and w_quantizer._amax is not None
-                and hasattr(w_quantizer, "global_amax")
             ):
-                w_quantizer.global_amax = w_quantizer._amax.float().amax()
+                target_device = weight_slice.device
+                if w_quantizer._amax.device != target_device:
+                    w_quantizer._amax = w_quantizer._amax.to(target_device)
+                if hasattr(w_quantizer, "global_amax"):
+                    w_quantizer.global_amax = w_quantizer._amax.float().amax()
 
             wrapper = nn.Module()
             wrapper.weight = nn.Parameter(weight_slice.contiguous(), requires_grad=False)

From c6fb826e5b0addb38c646f6bbea15bd55443f92d Mon Sep 17 00:00:00 2001
From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 15 May 2026 10:37:14 -0700
Subject: [PATCH 6/7] Replace moe_utils workarounds with a layer-skip hatch in
 _process_quantized_modules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverted the safe-CPU-amax / global_amax-sync / device-pinning patches in
moe_utils.py — those were working around a symptom: touching the per-expert
quantizers of layers that were never visited by the layerwise loop (their
_amax is unset). When MO_DEBUG_MAX_LAYERS=N is set, simply skip
_export_fused_experts for any module whose name contains ".layers.<i>."
with i >= N. Layers 0..N-1 all have
_bootstrap_uncalibrated_weight_quantizers + MSE-applied amaxes, so the
existing main moe_utils.py code path works for them.
---
 modelopt/torch/export/moe_utils.py         | 83 ++--------------------
 modelopt/torch/export/unified_export_hf.py | 11 +++
 2 files changed, 17 insertions(+), 77 deletions(-)

diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py
index 7bc06d42ee4..e325e5346f1 100644
--- a/modelopt/torch/export/moe_utils.py
+++ b/modelopt/torch/export/moe_utils.py
@@ -59,46 +59,9 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
     # 2-3. Split + export each per-expert projection.
     fused_dim0 = gate_up.shape[1]  # 2 * expert_dim
 
-    def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
-        """Return _amax as a clean tensor, surfacing any latent CUDA error first.
-
-        Layerwise calibration's _save_layer + full_restore can leave the per-expert
-        ``_amax`` as a CUDA tensor reconstructed from a serialized view with non-zero
-        storage offset. Touching it directly (``torch.all`` / ``deepcopy``) then triggers
-        ``cudaErrorIllegalAddress``. We synchronize first to surface any pending error,
-        then return the tensor on its original device. Falling back to CPU only on the
-        error path avoids creating a device mismatch with sibling buffers
-        (``_global_amax``) that stayed on the original device.
-        """
-        amax = getattr(quantizer_src, "_amax", None)
-        if amax is None or not isinstance(amax, torch.Tensor):
-            return None
-        try:
-            if amax.is_cuda:
-                torch.cuda.synchronize(amax.device)
-            # Force a no-op read to trigger any latent async error.
-            _ = amax.shape
-            return amax.detach()
-        except Exception:
-            # CUDA tensor was unreadable. Try to recover a CPU copy; if that
-            # also fails, treat as uncalibrated.
-            try:
-                return amax.detach().cpu().float()
-            except Exception:
-                return None
-
     for idx in range(n):
         expert = nn.Module()
 
-        # Pre-extract both per-expert amaxes to CPU *before* the projection loop's
-        # deepcopy. deepcopy calls .clone() on CUDA tensors — if the stored _amax
-        # has corrupt storage (under-calibrated experts after layerwise calib), the
-        # clone triggers an async CUDA illegal-memory-access error. Synchronizing in
-        # _safe_amax surfaces the error here so subsequent operations work on
-        # safe CPU float32 tensors.
-        gu_amax = _safe_amax(module.gate_up_proj_weight_quantizers[idx])
-        down_amax = _safe_amax(module.down_proj_weight_quantizers[idx])
-
         # If the gate_up source quantizer was never calibrated (rare expert
         # that received no calibration tokens), derive its amax once from the
         # FUSED tensor so gate and up share the same weight_scale_2 below.
@@ -109,11 +72,11 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
         # mismatched weight_scale_2 and garbled MoE output at inference.
         gate_up_q = module.gate_up_proj_weight_quantizers[idx]
         if getattr(gate_up_q, "is_enabled", False) and (
-            gu_amax is None or bool(torch.all(gu_amax == 0))
+            not hasattr(gate_up_q, "_amax")
+            or gate_up_q._amax is None
+            or torch.all(gate_up_q._amax == 0)
         ):
             gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32)
-            # Refresh the CPU amax we'll inject below.
-            gu_amax = _safe_amax(gate_up_q)
             warnings.warn(
                 f"Expert {idx} gate_up_proj weight quantizer was not calibrated "
                 f"(amax missing or zero). Using fused-tensor amax as fallback "
@@ -137,23 +100,7 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
             i_quantizer = gate_up_input_q if is_gate_up else down_input_q
 
             # gate/up share a weight quantizer — clone so each gets independent amax.
-            # Null _amax on source before deepcopy so the (possibly corrupt) CUDA tensor
-            # is never cloned; restore afterwards for the sibling projection. The CPU
-            # amax we pre-extracted gets injected in its place.
-            if is_gate_up:
-                _saved_amax = getattr(w_quantizer_src, "_amax", None)
-                try:
-                    w_quantizer_src._amax = None
-                    w_quantizer = copy.deepcopy(w_quantizer_src)
-                finally:
-                    w_quantizer_src._amax = _saved_amax
-                if gu_amax is not None:
-                    w_quantizer._amax = gu_amax
-            else:
-                w_quantizer = w_quantizer_src
-                if down_amax is not None:
-                    # Replace any CUDA-resident _amax with the safe CPU copy.
-                    w_quantizer._amax = down_amax
+            w_quantizer = copy.deepcopy(w_quantizer_src) if is_gate_up else w_quantizer_src
 
             # For per-channel amax (dim >= 1), proportionally slice dim-0
             # to match the split weight.
@@ -162,7 +109,7 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
                 and w_quantizer._amax is not None
                 and w_quantizer._amax.dim() >= 1
             ):
-                amax = w_quantizer._amax  # safe-extracted via _safe_amax (CUDA or CPU, recovered if corrupt)
+                amax = w_quantizer._amax
                 # Per-block _amax (NVFP4 static) collapses the row axis we want
                 # to slice on; restore it so dim-0 slicing splits gate/up.
                 if amax.numel() != fused_total and amax.numel() % fused_total == 0:
@@ -185,14 +132,13 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
                     )
 
             # If the weight quantizer was never calibrated, compute amax from weights.
-            # All amax tests below operate on the safe CPU tensor injected above.
             if (
                 hasattr(w_quantizer, "is_enabled")
                 and w_quantizer.is_enabled
                 and (
                     not hasattr(w_quantizer, "_amax")
                     or w_quantizer._amax is None
-                    or bool(torch.all(w_quantizer._amax == 0))
+                    or torch.all(w_quantizer._amax == 0)
                 )
             ):
                 w_quantizer.amax = weight_slice.abs().amax().to(torch.float32)
@@ -203,23 +149,6 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
                     stacklevel=2,
                 )
 
-            # Align _amax and global_amax with the weight slice's device. The
-            # export math ``per_block_scale * 448 / per_block_scale_max`` reads
-            # both from the quantizer and would otherwise error if they drifted
-            # apart (e.g., CPU-offloaded big-model layers + CUDA-resident weight
-            # slice, or our CPU-injected _amax + the original CUDA global_amax).
-            # No magnitude floor — that's main's policy for the uncalibrated
-            # fallback below.
-            if (
-                hasattr(w_quantizer, "_amax")
-                and w_quantizer._amax is not None
-            ):
-                target_device = weight_slice.device
-                if w_quantizer._amax.device != target_device:
-                    w_quantizer._amax = w_quantizer._amax.to(target_device)
-                if hasattr(w_quantizer, "global_amax"):
-                    w_quantizer.global_amax = w_quantizer._amax.float().amax()
-
             wrapper = nn.Module()
             wrapper.weight = nn.Parameter(weight_slice.contiguous(), requires_grad=False)
             wrapper.weight_quantizer = w_quantizer
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 0626d0a8fd5..7efddff0c89 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -17,6 +17,7 @@
 
 import collections.abc
 import json
+import os
 import re
 import tempfile
 import warnings
@@ -665,6 +666,16 @@ def _process_quantized_modules(
             # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList),
             # which get_quantization_format's singular-weight_quantizer check misses. Handle
             # it explicitly before the format gate so fused-experts get split + quantized.
+            # Debug hatch (paired with MO_DEBUG_MAX_LAYERS in model_calib.layerwise_calibrate):
+            # skip _export_fused_experts for layers whose layerwise calibration was never run.
+            # Those layers' per-expert quantizers have no _amax — touching them triggers the
+            # uncalibrated-fallback warnings or, with corrupt storage, a CUDA illegal-memory
+            # error. With the calibrated layers only, every expert has a valid _amax.
+            _debug_max = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0")
+            if _debug_max > 0:
+                _m = re.search(r"\.layers\.(\d+)\.", name or "")
+                if _m and int(_m.group(1)) >= _debug_max:
+                    continue
             with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                 _export_fused_experts(sub_module, dtype)
         elif get_quantization_format(sub_module) != QUANTIZATION_NONE:

From 5f0038a89513558c203ec86f3cf54e0fa97756fc Mon Sep 17 00:00:00 2001
From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 15 May 2026 12:01:36 -0700
Subject: [PATCH 7/7] Revert MO_DEBUG_MAX_LAYERS hatches in model_calib +
 unified_export_hf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The env-var-gated early-break (model_calib.layerwise_calibrate) and
export skip (unified_export_hf._process_quantized_modules) were only
needed to bound wall-clock time during the cliff-fix smoke test. The bug
fix itself is purely about not bringing over glm5.1-tmp's clamps in
moe_utils.py, and this branch already leaves them out. Removing the debug
hatches keeps the branch a clean superset of main's production behavior.
---
 modelopt/torch/export/unified_export_hf.py | 11 -----------
 modelopt/torch/quantization/model_calib.py |  9 ---------
 2 files changed, 20 deletions(-)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 7efddff0c89..0626d0a8fd5 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -17,7 +17,6 @@
 
 import collections.abc
 import json
-import os
 import re
 import tempfile
 import warnings
@@ -666,16 +665,6 @@ def _process_quantized_modules(
             # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList),
             # which get_quantization_format's singular-weight_quantizer check misses. Handle
             # it explicitly before the format gate so fused-experts get split + quantized.
-            # Debug hatch (paired with MO_DEBUG_MAX_LAYERS in model_calib.layerwise_calibrate):
-            # skip _export_fused_experts for layers whose layerwise calibration was never run.
-            # Those layers' per-expert quantizers have no _amax — touching them triggers the
-            # uncalibrated-fallback warnings or, with corrupt storage, a CUDA illegal-memory
-            # error. With the calibrated layers only, every expert has a valid _amax.
-            _debug_max = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0")
-            if _debug_max > 0:
-                _m = re.search(r"\.layers\.(\d+)\.", name or "")
-                if _m and int(_m.group(1)) >= _debug_max:
-                    continue
             with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                 _export_fused_experts(sub_module, dtype)
         elif get_quantization_format(sub_module) != QUANTIZATION_NONE:
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
index feab954059b..78b237847b1 100644
--- a/modelopt/torch/quantization/model_calib.py
+++ b/modelopt/torch/quantization/model_calib.py
@@ -16,7 +16,6 @@
 """Calibration utilities."""
 
 import math
-import os
 import time
 import warnings
 from collections.abc import Callable
@@ -1767,15 +1766,7 @@ def layerwise_calibrate(
             start_layer, resumed_inputs, forward_loop
         )
 
-        _debug_max_layers = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0")
-
         for layer_idx in range(start_layer, num_layers):
-            if _debug_max_layers > 0 and layer_idx >= _debug_max_layers:
-                print_rank_0(
-                    f"MO_DEBUG_MAX_LAYERS={_debug_max_layers}: stopping layerwise "
-                    f"calibration after layer {layer_idx - 1}/{num_layers}"
-                )
-                break
             layer = transformer_layers[layer_idx]
 
             def _layer_forward_loop(m, _inputs=layer_inputs):