From 04c5b85132afe3db56fefc365e5fad49a29d0822 Mon Sep 17 00:00:00 2001 From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> Date: Fri, 15 May 2026 00:43:39 -0700 Subject: [PATCH 1/7] Layerwise MoE calibration infra without the per-expert amax cliff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Brings over from origin/fridah/glm5.1-tmp: - nvfp4_tensor.py: FP8 underflow guard before float8_e4m3fn cast (clamp min=2**-9). Kept main's clamp(max=448) too, so we have both guards. - layerwise_calib.py: CPU offload of captured inputs, lazy device-move on replay, resume detection when all layers are already complete, separate quantizer_amaxes.pt per layer for fast amax-only restore, new restore_weights=False parameter on full_restore (skips reloading 2+ TB of unchanged expert weights at export time). - tensor_quantizer.py: NVFP4 static-quantizer dispatch in forward, for per-expert quantizers whose _amax is set by MSE after max_calibrate (experts not routed during max_calibrate stay as plain TensorQuantizer and need this dispatch). Kept is_nvfp4_static property — main uses it in model_calib.py, conversion.py, calib_utils.py, core_utils.py. - model_calib.py: MO_DEBUG_MAX_LAYERS env var hatch to limit layerwise calibration to first N layers (smoke-testing only). Does NOT bring over the moe_utils.py changes from c273ddb8 — those added an over-aggressive _min_valid_amax=2e-3 invalidity threshold + clamp(min=2e-3) on the fallback path which floored the effective per-block weight scale at 2e-3/6 ~= 3.3e-4 and produced the cliff seen in glm-5.1-nvfp4-MSE-expert-only-7ds-0509. Main's existing moe_utils.py (post #1340, #1421) handles uncalibrated experts gently via None / torch.all(_amax == 0) without any magnitude floor. Does NOT bring the per-expert MSE discovery hunks from cfe4a4aa — main's unified_export_hf.py and moe_utils.py are intentionally kept at main: main already has both _disable_use_cache and _sanitize_generation_config helpers; pulling glm5.1-tmp's older version of these files would have regressed those features. --- modelopt/torch/quantization/model_calib.py | 9 ++ .../nn/modules/tensor_quantizer.py | 25 +++++ .../quantization/utils/layerwise_calib.py | 101 ++++++++++++++---- 3 files changed, 114 insertions(+), 21 deletions(-) diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 78b237847b1..feab954059b 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -16,6 +16,7 @@ """Calibration utilities.""" import math +import os import time import warnings from collections.abc import Callable @@ -1766,7 +1767,15 @@ def layerwise_calibrate( start_layer, resumed_inputs, forward_loop ) + _debug_max_layers = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0") + for layer_idx in range(start_layer, num_layers): + if _debug_max_layers > 0 and layer_idx >= _debug_max_layers: + print_rank_0( + f"MO_DEBUG_MAX_LAYERS={_debug_max_layers}: stopping layerwise " + f"calibration after layer {layer_idx - 1}/{num_layers}" + ) + break layer = transformer_layers[layer_idx] def _layer_forward_loop(m, _inputs=layer_inputs): diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index 5e3cea44c2a..cda08964fbd 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -824,6 +824,31 @@ def _fake_quantize(self, inputs): getattr(self, "_onnx_quantizer_type", None), self._pass_through_bwd, ) + elif ( + self.block_sizes is not None + and self._num_bits == (2, 1) + and self.block_sizes.get("scale_bits") == (4, 3) + ): + # Static NVFP4: plain TensorQuantizer should have been promoted to + # NVFP4StaticQuantizer during MSE setup. For per-expert quantizers + # in fused MoEs, promotion is gated on `_amax` having been set during + # max_calibrate; experts not activated during max_calibrate stay + # plain. MSE later sets a per-block `_amax`, so by the time forward + # runs again the quantizer has a valid amax — dispatch to the static + # NVFP4 fake-quant path here. + if amax is not None: + outputs = static_blockwise_fp4_fake_quant( + inputs, + amax, + None, # global_amax — computed internally by the kernel + True, + inputs.dtype, + self._pass_through_bwd, + ) + else: + # No amax at all (truly uncalibrated): pass through unchanged so + # forward doesn't crash. Should not normally be reachable. + outputs = inputs elif isinstance(self._num_bits, tuple): # Float-point quantization, e.g., FP8 E, M = self._num_bits # noqa: N806 diff --git a/modelopt/torch/quantization/utils/layerwise_calib.py b/modelopt/torch/quantization/utils/layerwise_calib.py index aed403ad87b..be139cafadb 100644 --- a/modelopt/torch/quantization/utils/layerwise_calib.py +++ b/modelopt/torch/quantization/utils/layerwise_calib.py @@ -227,12 +227,30 @@ def _patched_forward(self, *args, **kwargs): f"Layer {info.name} is in 'run' mode but has no cached inputs to replay." ) real_args, real_kwargs = info.cached_inputs.popleft() + if ( + real_args + and isinstance(real_args[0], torch.Tensor) + and real_args[0].device.type == "cpu" + ): + device = get_module_device(self) + real_args = _move_to_device(real_args, device) + real_kwargs = _move_to_device(real_kwargs, device) output = self._original_forward(*real_args, **real_kwargs) info.output_meta = LayerActivationCollector._extract_output_meta(output) return output if info.mode == "capture": - info.collected_inputs.append((args, kwargs)) + # Offload captured inputs to CPU at append time. For early layers + # on a single GPU (e.g. layer 0–2 on GPU 0 with seq_device_map), + # accumulating thousands of batches' worth of (bs × seq × hidden) + # activations on-device saturates that GPU during the capture loop + # and OOMs before _set_layer_states gets a chance to move them. + # The "run" branch already handles CPU-resident inputs (see the + # device-check above), so storing on CPU is safe end-to-end. + cpu = torch.device("cpu") + info.collected_inputs.append( + (_move_to_device(args, cpu), _move_to_device(kwargs, cpu)) + ) raise _EarlyStopForwardError() return self._original_forward(*args, **kwargs) @@ -315,7 +333,11 @@ def _set_layer_states(self, layer_idx: int): "was called for every preceding layer in order." ) prev.mode = "run" - prev.cached_inputs = deque(prev.collected_inputs) + cpu = torch.device("cpu") + prev.cached_inputs = deque( + (_move_to_device(args, cpu), _move_to_device(kwargs, cpu)) + for args, kwargs in prev.collected_inputs + ) prev.collected_inputs = [] cur = self._decoder_layers[layer_idx]._layerwise_calib @@ -433,6 +455,10 @@ def cache_outputs_for_next_layer_calib( next_layer = self._decoder_layers[next_idx] with persistent_materialization(layer): + # Release PyTorch's cached-but-unused GPU memory before the forward pass. + # After MSE weight sweeps, the allocator holds freed blocks in its cache; + # empty_cache() returns them to CUDA so the capture forward pass has headroom. + torch.cuda.empty_cache() return self.get_input_activations(next_layer, forward_loop) @@ -508,14 +534,19 @@ def _save_layer( torch.save(output_meta, os.path.join(d, "output_meta.pt")) if next_inputs is not None: torch.save(next_inputs, os.path.join(d, "next_inputs.pt")) + amax_state = {k: v for k, v in weights.items() if "_amax" in k} + if amax_state: + torch.save(amax_state, os.path.join(d, "quantizer_amaxes.pt")) _write_manifest(checkpoint_dir, idx, num_layers) def detect_resume_point(checkpoint_dir: str) -> tuple[int, dict] | None: """Detect where to resume from an existing checkpoint directory. - Returns ``(start_layer, manifest)`` if there is work to resume, - or ``None`` if the directory is empty, corrupt, or calibration was already complete. + Returns ``(start_layer, manifest)`` if there is a checkpoint, or ``None`` if + the directory is empty or corrupt. When all layers are calibrated, + ``start_layer == num_layers``; ``layerwise_calibrate`` detects this and + early-returns after ``full_restore`` without re-running calibration. """ manifest = _read_manifest(checkpoint_dir) if manifest is None: @@ -524,8 +555,6 @@ def detect_resume_point(checkpoint_dir: str) -> tuple[int, dict] | None: total = manifest.get("num_layers") if last is None or total is None: return None - if last + 1 >= total: - return None return (last + 1, manifest) @@ -568,7 +597,9 @@ def from_folder(cls, checkpoint_dir: str | None, num_layers: int) -> _Checkpoint f"but model has {num_layers}. Use a fresh checkpoint directory." ) start = info[0] if info else 0 - if start > 0: + if start >= num_layers: + print_rank_0(f"Checkpoint: all {num_layers} layers already calibrated") + elif start > 0: print_rank_0( f"Checkpoint: resuming layerwise calibration from layer {start}/{num_layers}" ) @@ -601,12 +632,20 @@ def setup_resume(self, layers: nn.ModuleList) -> list | None: raise FileNotFoundError(f"Cannot resume: next_inputs.pt missing for layer {last_ckpt}") # weights_only=False is safe: file is internally generated by _save_layer, not user-supplied next_inputs = torch.load(next_inputs_path, map_location="cpu", weights_only=False) - resume_device = get_module_device(layers[self.start_layer]) - next_inputs = _move_to_device(next_inputs, resume_device) + # Keep on CPU — _patched_forward's run mode moves each entry to device on pop. return next_inputs - def full_restore(self, layers: nn.ModuleList, model: nn.Module) -> None: - """Restore weights and quantizer state for layers 0..K-1 after the calibration loop.""" + def full_restore( + self, layers: nn.ModuleList, model: nn.Module, restore_weights: bool = True + ) -> None: + """Restore weights and quantizer state for layers 0..K-1 after the calibration loop. + + Args: + restore_weights: If False, skip reloading ``weights.pt`` and load only the + ``_amax`` values (from ``quantizer_amaxes.pt`` or filtered from ``weights.pt``). + Set to False for calibration algorithms (max, MSE) that never modify weights + to avoid re-reading gigabytes of unchanged expert weights from disk. + """ from modelopt.torch.quantization.config import QuantizeConfig from modelopt.torch.quantization.conversion import restore_quantizer_state from modelopt.torch.quantization.utils.core_utils import enable_weight_access_and_writeback @@ -620,23 +659,43 @@ def full_restore(self, layers: nn.ModuleList, model: nn.Module) -> None: layer = layers[i] d = _layer_dir(self.checkpoint_dir, i) - # Resolve layer_device and load inside the context so params are - # materialized — otherwise get_module_device can return meta. + # Load inside the context so params are materialized — otherwise + # get_module_device can return meta. with enable_weight_access_and_writeback(layer, model, name_to_module): - layer_device = get_module_device(layer) + # Load to CPU first — prevents CUDA tensors with non-zero storage_offset + # being reconstructed from serialized views, which causes illegal memory + # access when later cloned (e.g. inside deepcopy in _export_fused_experts). # weights_only=False is safe: files are internally generated by _save_layer qstate = torch.load( os.path.join(d, "quantizer_state.pt"), - map_location=layer_device, - weights_only=False, - ) - weights = torch.load( - os.path.join(d, "weights.pt"), - map_location=layer_device, + map_location="cpu", weights_only=False, ) restore_quantizer_state(layer, dummy_config, {"quantizer_state": qstate}) - layer.load_state_dict(weights, strict=False, assign=True) + if restore_weights: + weights = torch.load( + os.path.join(d, "weights.pt"), + map_location="cpu", + weights_only=False, + ) + layer.load_state_dict(weights, strict=False, assign=False) + else: + # Load only _amax entries — skip gigabytes of unchanged expert weights. + # Use map_location="cpu" to get fresh CPU tensors (no storage_offset). + # _export_fused_experts moves _amax to the weight device on demand. + amax_path = os.path.join(d, "quantizer_amaxes.pt") + if os.path.exists(amax_path): + amaxes = torch.load(amax_path, map_location="cpu", weights_only=False) + else: + # Legacy checkpoint: filter _amax entries from the full weights.pt. + weights = torch.load( + os.path.join(d, "weights.pt"), + map_location="cpu", + weights_only=False, + ) + amaxes = {k: v for k, v in weights.items() if "_amax" in k} + if amaxes: + layer.load_state_dict(amaxes, strict=False, assign=True) print_rank_0(f"Checkpoint: restored {self.start_layer} previously calibrated layers") From f9803e4e8cba363f01ce2380b7ce29a59bdef999 Mon Sep 17 00:00:00 2001 From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> Date: Fri, 15 May 2026 01:46:17 -0700 Subject: [PATCH 2/7] moe_utils: safe CPU amax extract for export-time uncalibrated-expert path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds _safe_cpu_amax helper + null-then-deepcopy pattern to _export_fused_experts to avoid cudaErrorIllegalAddress when the per-expert _amax came back from a layerwise checkpoint as a CUDA tensor with non-zero storage offset / corrupt storage. Pre-extracts amax to CPU with explicit synchronize() before any torch.all() / deepcopy() touches it. This is a strict subset of c273ddb8's moe_utils changes — the cliff-creating _min_valid_amax=2e-3 invalidity threshold + clamp(min=2e-3) on the fallback are deliberately NOT brought over. Uncalibrated experts still fall back to weight_slice.abs().amax() without any magnitude floor, matching main's existing semantics. model_quant: ensure output_dir exists before writing .quant_summary.txt. Fixes FileNotFoundError when print_quant_summary runs before the export step creates the directory (FIXES Fix 4). --- modelopt/torch/export/moe_utils.py | 58 ++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py index e325e5346f1..f6775359c84 100644 --- a/modelopt/torch/export/moe_utils.py +++ b/modelopt/torch/export/moe_utils.py @@ -59,9 +59,38 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: # 2-3. Split + export each per-expert projection. fused_dim0 = gate_up.shape[1] # 2 * expert_dim + def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None: + """Extract _amax to CPU float32, surfacing+clearing any pending CUDA error first. + + Layerwise calibration's _save_layer + full_restore can leave the per-expert + ``_amax`` as a CUDA tensor reconstructed from a serialized view with non-zero + storage offset. Touching it directly (``torch.all`` / ``deepcopy``) then triggers + ``cudaErrorIllegalAddress``. Synchronizing first surfaces the latent error so the + subsequent ``.detach().cpu()`` either succeeds on a clean tensor or we fall through + to ``None`` and treat the expert as uncalibrated. + """ + amax = getattr(quantizer_src, "_amax", None) + if amax is None or not isinstance(amax, torch.Tensor): + return None + try: + if amax.is_cuda: + torch.cuda.synchronize(amax.device) + return amax.detach().cpu().float() + except Exception: + return None + for idx in range(n): expert = nn.Module() + # Pre-extract both per-expert amaxes to CPU *before* the projection loop's + # deepcopy. deepcopy calls .clone() on CUDA tensors — if the stored _amax + # has corrupt storage (under-calibrated experts after layerwise calib), the + # clone triggers an async CUDA illegal-memory-access error. Synchronizing in + # _safe_cpu_amax surfaces the error here so subsequent operations work on + # safe CPU float32 tensors. + gu_amax_cpu = _safe_cpu_amax(module.gate_up_proj_weight_quantizers[idx]) + down_amax_cpu = _safe_cpu_amax(module.down_proj_weight_quantizers[idx]) + # If the gate_up source quantizer was never calibrated (rare expert # that received no calibration tokens), derive its amax once from the # FUSED tensor so gate and up share the same weight_scale_2 below. @@ -72,11 +101,11 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: # mismatched weight_scale_2 and garbled MoE output at inference. gate_up_q = module.gate_up_proj_weight_quantizers[idx] if getattr(gate_up_q, "is_enabled", False) and ( - not hasattr(gate_up_q, "_amax") - or gate_up_q._amax is None - or torch.all(gate_up_q._amax == 0) + gu_amax_cpu is None or bool(torch.all(gu_amax_cpu == 0)) ): gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32) + # Refresh the CPU amax we'll inject below. + gu_amax_cpu = _safe_cpu_amax(gate_up_q) warnings.warn( f"Expert {idx} gate_up_proj weight quantizer was not calibrated " f"(amax missing or zero). Using fused-tensor amax as fallback " @@ -100,7 +129,23 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: i_quantizer = gate_up_input_q if is_gate_up else down_input_q # gate/up share a weight quantizer — clone so each gets independent amax. - w_quantizer = copy.deepcopy(w_quantizer_src) if is_gate_up else w_quantizer_src + # Null _amax on source before deepcopy so the (possibly corrupt) CUDA tensor + # is never cloned; restore afterwards for the sibling projection. The CPU + # amax we pre-extracted gets injected in its place. + if is_gate_up: + _saved_amax = getattr(w_quantizer_src, "_amax", None) + try: + w_quantizer_src._amax = None + w_quantizer = copy.deepcopy(w_quantizer_src) + finally: + w_quantizer_src._amax = _saved_amax + if gu_amax_cpu is not None: + w_quantizer._amax = gu_amax_cpu + else: + w_quantizer = w_quantizer_src + if down_amax_cpu is not None: + # Replace any CUDA-resident _amax with the safe CPU copy. + w_quantizer._amax = down_amax_cpu # For per-channel amax (dim >= 1), proportionally slice dim-0 # to match the split weight. @@ -109,7 +154,7 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: and w_quantizer._amax is not None and w_quantizer._amax.dim() >= 1 ): - amax = w_quantizer._amax + amax = w_quantizer._amax # already CPU float32 thanks to _safe_cpu_amax # Per-block _amax (NVFP4 static) collapses the row axis we want # to slice on; restore it so dim-0 slicing splits gate/up. if amax.numel() != fused_total and amax.numel() % fused_total == 0: @@ -132,13 +177,14 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: ) # If the weight quantizer was never calibrated, compute amax from weights. + # All amax tests below operate on the safe CPU tensor injected above. if ( hasattr(w_quantizer, "is_enabled") and w_quantizer.is_enabled and ( not hasattr(w_quantizer, "_amax") or w_quantizer._amax is None - or torch.all(w_quantizer._amax == 0) + or bool(torch.all(w_quantizer._amax == 0)) ) ): w_quantizer.amax = weight_slice.abs().amax().to(torch.float32) From f349e5738915efc78df1b5da61c5bd81fbde4784 Mon Sep 17 00:00:00 2001 From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> Date: Fri, 15 May 2026 02:12:03 -0700 Subject: [PATCH 3/7] moe_utils: sync global_amax to same device as per-block _amax after CPU extract --- modelopt/torch/export/moe_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py index f6775359c84..0f5f96d92c7 100644 --- a/modelopt/torch/export/moe_utils.py +++ b/modelopt/torch/export/moe_utils.py @@ -195,6 +195,18 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None: stacklevel=2, ) + # Recompute global_amax from the (CPU) per-block _amax so both live on + # the same device when get_weights_scaling_factor_from_quantizer + # does ``per_block_scale * 448 / per_block_scale_max``. Without this, + # a stale CUDA global_amax mismatches the CPU per-block amax we + # injected above. No magnitude floor — that's main's policy. + if ( + hasattr(w_quantizer, "_amax") + and w_quantizer._amax is not None + and hasattr(w_quantizer, "global_amax") + ): + w_quantizer.global_amax = w_quantizer._amax.float().amax() + wrapper = nn.Module() wrapper.weight = nn.Parameter(weight_slice.contiguous(), requires_grad=False) wrapper.weight_quantizer = w_quantizer From 2be889a16384b7e6266ce587a118e9023427bb9d Mon Sep 17 00:00:00 2001 From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> Date: Fri, 15 May 2026 09:44:11 -0700 Subject: [PATCH 4/7] moe_utils: keep _amax on its native device when safe (avoid cuda/cpu mismatch with _global_amax) --- modelopt/torch/export/moe_utils.py | 42 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py index 0f5f96d92c7..797654222f8 100644 --- a/modelopt/torch/export/moe_utils.py +++ b/modelopt/torch/export/moe_utils.py @@ -59,15 +59,16 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: # 2-3. Split + export each per-expert projection. fused_dim0 = gate_up.shape[1] # 2 * expert_dim - def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None: - """Extract _amax to CPU float32, surfacing+clearing any pending CUDA error first. + def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None: + """Return _amax as a clean tensor, surfacing any latent CUDA error first. Layerwise calibration's _save_layer + full_restore can leave the per-expert ``_amax`` as a CUDA tensor reconstructed from a serialized view with non-zero storage offset. Touching it directly (``torch.all`` / ``deepcopy``) then triggers - ``cudaErrorIllegalAddress``. Synchronizing first surfaces the latent error so the - subsequent ``.detach().cpu()`` either succeeds on a clean tensor or we fall through - to ``None`` and treat the expert as uncalibrated. + ``cudaErrorIllegalAddress``. We synchronize first to surface any pending error, + then return the tensor on its original device. Falling back to CPU only on the + error path avoids creating a device mismatch with sibling buffers + (``_global_amax``) that stayed on the original device. """ amax = getattr(quantizer_src, "_amax", None) if amax is None or not isinstance(amax, torch.Tensor): @@ -75,9 +76,16 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None: try: if amax.is_cuda: torch.cuda.synchronize(amax.device) - return amax.detach().cpu().float() + # Force a no-op read to trigger any latent async error. + _ = amax.shape + return amax.detach() except Exception: - return None + # CUDA tensor was unreadable. Try to recover a CPU copy; if that + # also fails, treat as uncalibrated. + try: + return amax.detach().cpu().float() + except Exception: + return None for idx in range(n): expert = nn.Module() @@ -86,10 +94,10 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None: # deepcopy. deepcopy calls .clone() on CUDA tensors — if the stored _amax # has corrupt storage (under-calibrated experts after layerwise calib), the # clone triggers an async CUDA illegal-memory-access error. Synchronizing in - # _safe_cpu_amax surfaces the error here so subsequent operations work on + # _safe_amax surfaces the error here so subsequent operations work on # safe CPU float32 tensors. - gu_amax_cpu = _safe_cpu_amax(module.gate_up_proj_weight_quantizers[idx]) - down_amax_cpu = _safe_cpu_amax(module.down_proj_weight_quantizers[idx]) + gu_amax = _safe_amax(module.gate_up_proj_weight_quantizers[idx]) + down_amax = _safe_amax(module.down_proj_weight_quantizers[idx]) # If the gate_up source quantizer was never calibrated (rare expert # that received no calibration tokens), derive its amax once from the @@ -101,11 +109,11 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None: # mismatched weight_scale_2 and garbled MoE output at inference. gate_up_q = module.gate_up_proj_weight_quantizers[idx] if getattr(gate_up_q, "is_enabled", False) and ( - gu_amax_cpu is None or bool(torch.all(gu_amax_cpu == 0)) + gu_amax is None or bool(torch.all(gu_amax == 0)) ): gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32) # Refresh the CPU amax we'll inject below. - gu_amax_cpu = _safe_cpu_amax(gate_up_q) + gu_amax = _safe_amax(gate_up_q) warnings.warn( f"Expert {idx} gate_up_proj weight quantizer was not calibrated " f"(amax missing or zero). Using fused-tensor amax as fallback " @@ -139,13 +147,13 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None: w_quantizer = copy.deepcopy(w_quantizer_src) finally: w_quantizer_src._amax = _saved_amax - if gu_amax_cpu is not None: - w_quantizer._amax = gu_amax_cpu + if gu_amax is not None: + w_quantizer._amax = gu_amax else: w_quantizer = w_quantizer_src - if down_amax_cpu is not None: + if down_amax is not None: # Replace any CUDA-resident _amax with the safe CPU copy. - w_quantizer._amax = down_amax_cpu + w_quantizer._amax = down_amax # For per-channel amax (dim >= 1), proportionally slice dim-0 # to match the split weight. @@ -154,7 +162,7 @@ def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None: and w_quantizer._amax is not None and w_quantizer._amax.dim() >= 1 ): - amax = w_quantizer._amax # already CPU float32 thanks to _safe_cpu_amax + amax = w_quantizer._amax # safe-extracted via _safe_amax (CUDA or CPU, recovered if corrupt) # Per-block _amax (NVFP4 static) collapses the row axis we want # to slice on; restore it so dim-0 slicing splits gate/up. if amax.numel() != fused_total and amax.numel() % fused_total == 0: From 59e9e7c9e82e8a700a15dc7d77b364af63e596a1 Mon Sep 17 00:00:00 2001 From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> Date: Fri, 15 May 2026 10:31:46 -0700 Subject: [PATCH 5/7] moe_utils: pin _amax + global_amax to the weight slice's device before export The big-model run hit a cuda:0 vs cpu device mismatch in get_weights_scaling_factor_from_quantizer (per_block_scale * 448 / per_block_scale_max). Root cause: the big model is large enough that device_map='sequential' offloads some params to CPU, so _amax and global_amax can land on different devices after deepcopy + injection. Pin both to weight_slice.device right before calling _export_quantized_weight. No magnitude clamp. --- modelopt/torch/export/moe_utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py index 797654222f8..7bc06d42ee4 100644 --- a/modelopt/torch/export/moe_utils.py +++ b/modelopt/torch/export/moe_utils.py @@ -203,17 +203,22 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None: stacklevel=2, ) - # Recompute global_amax from the (CPU) per-block _amax so both live on - # the same device when get_weights_scaling_factor_from_quantizer - # does ``per_block_scale * 448 / per_block_scale_max``. Without this, - # a stale CUDA global_amax mismatches the CPU per-block amax we - # injected above. No magnitude floor — that's main's policy. + # Align _amax and global_amax with the weight slice's device. The + # export math ``per_block_scale * 448 / per_block_scale_max`` reads + # both from the quantizer and would otherwise error if they drifted + # apart (e.g., CPU-offloaded big-model layers + CUDA-resident weight + # slice, or our CPU-injected _amax + the original CUDA global_amax). + # No magnitude floor — that's main's policy for the uncalibrated + # fallback below. if ( hasattr(w_quantizer, "_amax") and w_quantizer._amax is not None - and hasattr(w_quantizer, "global_amax") ): - w_quantizer.global_amax = w_quantizer._amax.float().amax() + target_device = weight_slice.device + if w_quantizer._amax.device != target_device: + w_quantizer._amax = w_quantizer._amax.to(target_device) + if hasattr(w_quantizer, "global_amax"): + w_quantizer.global_amax = w_quantizer._amax.float().amax() wrapper = nn.Module() wrapper.weight = nn.Parameter(weight_slice.contiguous(), requires_grad=False) From c6fb826e5b0addb38c646f6bbea15bd55443f92d Mon Sep 17 00:00:00 2001 From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> Date: Fri, 15 May 2026 10:37:14 -0700 Subject: [PATCH 6/7] Replace moe_utils workarounds with a layer-skip hatch in _process_quantized_modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverted the safe-CPU-amax / global_amax-sync / device-pinning patches in moe_utils.py — those were working around a symptom: touching the per-expert quantizers of layers that were never visited by the layerwise loop (their _amax is unset). When MO_DEBUG_MAX_LAYERS=N is set, simply skip _export_fused_experts for any *.layers.{>=N}.* module. Layers 0..N-1 all have _bootstrap_uncalibrated_weight_quantizers + MSE-applied amaxes so the existing main moe_utils.py code path works. --- modelopt/torch/export/moe_utils.py | 83 ++-------------------- modelopt/torch/export/unified_export_hf.py | 11 +++ 2 files changed, 17 insertions(+), 77 deletions(-) diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py index 7bc06d42ee4..e325e5346f1 100644 --- a/modelopt/torch/export/moe_utils.py +++ b/modelopt/torch/export/moe_utils.py @@ -59,46 +59,9 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: # 2-3. Split + export each per-expert projection. fused_dim0 = gate_up.shape[1] # 2 * expert_dim - def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None: - """Return _amax as a clean tensor, surfacing any latent CUDA error first. - - Layerwise calibration's _save_layer + full_restore can leave the per-expert - ``_amax`` as a CUDA tensor reconstructed from a serialized view with non-zero - storage offset. Touching it directly (``torch.all`` / ``deepcopy``) then triggers - ``cudaErrorIllegalAddress``. We synchronize first to surface any pending error, - then return the tensor on its original device. Falling back to CPU only on the - error path avoids creating a device mismatch with sibling buffers - (``_global_amax``) that stayed on the original device. - """ - amax = getattr(quantizer_src, "_amax", None) - if amax is None or not isinstance(amax, torch.Tensor): - return None - try: - if amax.is_cuda: - torch.cuda.synchronize(amax.device) - # Force a no-op read to trigger any latent async error. - _ = amax.shape - return amax.detach() - except Exception: - # CUDA tensor was unreadable. Try to recover a CPU copy; if that - # also fails, treat as uncalibrated. - try: - return amax.detach().cpu().float() - except Exception: - return None - for idx in range(n): expert = nn.Module() - # Pre-extract both per-expert amaxes to CPU *before* the projection loop's - # deepcopy. deepcopy calls .clone() on CUDA tensors — if the stored _amax - # has corrupt storage (under-calibrated experts after layerwise calib), the - # clone triggers an async CUDA illegal-memory-access error. Synchronizing in - # _safe_amax surfaces the error here so subsequent operations work on - # safe CPU float32 tensors. - gu_amax = _safe_amax(module.gate_up_proj_weight_quantizers[idx]) - down_amax = _safe_amax(module.down_proj_weight_quantizers[idx]) - # If the gate_up source quantizer was never calibrated (rare expert # that received no calibration tokens), derive its amax once from the # FUSED tensor so gate and up share the same weight_scale_2 below. @@ -109,11 +72,11 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None: # mismatched weight_scale_2 and garbled MoE output at inference. gate_up_q = module.gate_up_proj_weight_quantizers[idx] if getattr(gate_up_q, "is_enabled", False) and ( - gu_amax is None or bool(torch.all(gu_amax == 0)) + not hasattr(gate_up_q, "_amax") + or gate_up_q._amax is None + or torch.all(gate_up_q._amax == 0) ): gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32) - # Refresh the CPU amax we'll inject below. - gu_amax = _safe_amax(gate_up_q) warnings.warn( f"Expert {idx} gate_up_proj weight quantizer was not calibrated " f"(amax missing or zero). Using fused-tensor amax as fallback " @@ -137,23 +100,7 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None: i_quantizer = gate_up_input_q if is_gate_up else down_input_q # gate/up share a weight quantizer — clone so each gets independent amax. - # Null _amax on source before deepcopy so the (possibly corrupt) CUDA tensor - # is never cloned; restore afterwards for the sibling projection. The CPU - # amax we pre-extracted gets injected in its place. - if is_gate_up: - _saved_amax = getattr(w_quantizer_src, "_amax", None) - try: - w_quantizer_src._amax = None - w_quantizer = copy.deepcopy(w_quantizer_src) - finally: - w_quantizer_src._amax = _saved_amax - if gu_amax is not None: - w_quantizer._amax = gu_amax - else: - w_quantizer = w_quantizer_src - if down_amax is not None: - # Replace any CUDA-resident _amax with the safe CPU copy. - w_quantizer._amax = down_amax + w_quantizer = copy.deepcopy(w_quantizer_src) if is_gate_up else w_quantizer_src # For per-channel amax (dim >= 1), proportionally slice dim-0 # to match the split weight. @@ -162,7 +109,7 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None: and w_quantizer._amax is not None and w_quantizer._amax.dim() >= 1 ): - amax = w_quantizer._amax # safe-extracted via _safe_amax (CUDA or CPU, recovered if corrupt) + amax = w_quantizer._amax # Per-block _amax (NVFP4 static) collapses the row axis we want # to slice on; restore it so dim-0 slicing splits gate/up. if amax.numel() != fused_total and amax.numel() % fused_total == 0: @@ -185,14 +132,13 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None: ) # If the weight quantizer was never calibrated, compute amax from weights. - # All amax tests below operate on the safe CPU tensor injected above. if ( hasattr(w_quantizer, "is_enabled") and w_quantizer.is_enabled and ( not hasattr(w_quantizer, "_amax") or w_quantizer._amax is None - or bool(torch.all(w_quantizer._amax == 0)) + or torch.all(w_quantizer._amax == 0) ) ): w_quantizer.amax = weight_slice.abs().amax().to(torch.float32) @@ -203,23 +149,6 @@ def _safe_amax(quantizer_src: nn.Module) -> torch.Tensor | None: stacklevel=2, ) - # Align _amax and global_amax with the weight slice's device. The - # export math ``per_block_scale * 448 / per_block_scale_max`` reads - # both from the quantizer and would otherwise error if they drifted - # apart (e.g., CPU-offloaded big-model layers + CUDA-resident weight - # slice, or our CPU-injected _amax + the original CUDA global_amax). - # No magnitude floor — that's main's policy for the uncalibrated - # fallback below. - if ( - hasattr(w_quantizer, "_amax") - and w_quantizer._amax is not None - ): - target_device = weight_slice.device - if w_quantizer._amax.device != target_device: - w_quantizer._amax = w_quantizer._amax.to(target_device) - if hasattr(w_quantizer, "global_amax"): - w_quantizer.global_amax = w_quantizer._amax.float().amax() - wrapper = nn.Module() wrapper.weight = nn.Parameter(weight_slice.contiguous(), requires_grad=False) wrapper.weight_quantizer = w_quantizer diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 0626d0a8fd5..7efddff0c89 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -17,6 +17,7 @@ import collections.abc import json +import os import re import tempfile import warnings @@ -665,6 +666,16 @@ def _process_quantized_modules( # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList), # which get_quantization_format's singular-weight_quantizer check misses. Handle # it explicitly before the format gate so fused-experts get split + quantized. + # Debug hatch (paired with MO_DEBUG_MAX_LAYERS in model_calib.layerwise_calibrate): + # skip _export_fused_experts for layers whose layerwise calibration was never run. + # Those layers' per-expert quantizers have no _amax — touching them triggers the + # uncalibrated-fallback warnings or, with corrupt storage, a CUDA illegal-memory + # error. With the calibrated layers only, every expert has a valid _amax. + _debug_max = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0") + if _debug_max > 0: + _m = re.search(r"\.layers\.(\d+)\.", name or "") + if _m and int(_m.group(1)) >= _debug_max: + continue with fsdp2_aware_weight_update(model, sub_module, reshard=False): _export_fused_experts(sub_module, dtype) elif get_quantization_format(sub_module) != QUANTIZATION_NONE: From 5f0038a89513558c203ec86f3cf54e0fa97756fc Mon Sep 17 00:00:00 2001 From: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> Date: Fri, 15 May 2026 12:01:36 -0700 Subject: [PATCH 7/7] Revert MO_DEBUG_MAX_LAYERS hatches in model_calib + unified_export_hf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The env-var-gated early-break (model_calib.layerwise_calibrate) and export skip (unified_export_hf._process_quantized_modules) were only needed to bound wall-clock during the cliff-fix smoke test. The bug fix itself is purely about not bringing over glm5.1-tmp's clamps in moe_utils.py — which we already don't. Removing the debug hatches keeps the branch a clean superset of main's production behavior. --- modelopt/torch/export/unified_export_hf.py | 11 ----------- modelopt/torch/quantization/model_calib.py | 9 --------- 2 files changed, 20 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 7efddff0c89..0626d0a8fd5 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -17,7 +17,6 @@ import collections.abc import json -import os import re import tempfile import warnings @@ -666,16 +665,6 @@ def _process_quantized_modules( # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList), # which get_quantization_format's singular-weight_quantizer check misses. Handle # it explicitly before the format gate so fused-experts get split + quantized. - # Debug hatch (paired with MO_DEBUG_MAX_LAYERS in model_calib.layerwise_calibrate): - # skip _export_fused_experts for layers whose layerwise calibration was never run. - # Those layers' per-expert quantizers have no _amax — touching them triggers the - # uncalibrated-fallback warnings or, with corrupt storage, a CUDA illegal-memory - # error. With the calibrated layers only, every expert has a valid _amax. - _debug_max = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0") - if _debug_max > 0: - _m = re.search(r"\.layers\.(\d+)\.", name or "") - if _m and int(_m.group(1)) >= _debug_max: - continue with fsdp2_aware_weight_update(model, sub_module, reshard=False): _export_fused_experts(sub_module, dtype) elif get_quantization_format(sub_module) != QUANTIZATION_NONE: diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index feab954059b..78b237847b1 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -16,7 +16,6 @@ """Calibration utilities.""" import math -import os import time import warnings from collections.abc import Callable @@ -1767,15 +1766,7 @@ def layerwise_calibrate( start_layer, resumed_inputs, forward_loop ) - _debug_max_layers = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0") - for layer_idx in range(start_layer, num_layers): - if _debug_max_layers > 0 and layer_idx >= _debug_max_layers: - print_rank_0( - f"MO_DEBUG_MAX_LAYERS={_debug_max_layers}: stopping layerwise " - f"calibration after layer {layer_idx - 1}/{num_layers}" - ) - break layer = transformer_layers[layer_idx] def _layer_forward_loop(m, _inputs=layer_inputs):