From 55c2d3500e5dff8e26c5dd0daae2f472f62872b7 Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Thu, 5 Feb 2026 18:55:57 +0000
Subject: [PATCH 1/5] export support for NVFP4 static

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 modelopt/torch/export/quant_utils.py       | 57 +++++++++++++++++++++-
 modelopt/torch/export/unified_export_hf.py | 25 +++++++---
 2 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 0d99d44f0..c8ebb1150 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -45,7 +45,7 @@
 )
 from modelopt.torch.utils import clear_cuda_cache
 
-from ..quantization.nn import SequentialQuantizer, TensorQuantizer
+from ..quantization.nn import NVFP4StaticQuantizer, SequentialQuantizer, TensorQuantizer
 from .model_config import (
     KV_CACHE_FP8,
     KV_CACHE_INT8,
@@ -299,6 +299,31 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
         return get_scaling_factor(weight_quantizer[0])
 
     quantization_format = get_quantization_format(module)
+
+    # Handle NVFP4 static quantizer (has pre-computed per-block amax values)
+    is_nvfp4_static = isinstance(weight_quantizer, NVFP4StaticQuantizer) or getattr(
+        weight_quantizer, "_is_nvfp4_static_quantizer", False
+    )
+    if is_nvfp4_static:
+        # For static NVFP4, _amax contains per-block amax values and _global_amax is the global scale
+        assert (
+            hasattr(weight_quantizer, "_global_amax") and weight_quantizer._global_amax is not None
+        )
+        global_amax = weight_quantizer._global_amax.float()
+        per_block_amax = weight_quantizer._amax.float()
+
+        block_size = weight_quantizer.block_sizes[-1]
+        weight_scaling_factor_2 = global_amax / (6.0 * 448.0)
+        per_block_scale = per_block_amax / (6.0 * weight_scaling_factor_2.to(per_block_amax.device))
+        per_block_scale[per_block_scale == 0] = 1.0
+
+        # Reshape per_block_scale to match weight's block structure: (rows, num_blocks_per_row)
+        num_blocks_per_row = weight.shape[-1] // block_size
+        expected_shape = (*weight.shape[:-1], num_blocks_per_row)
+        per_block_scale = per_block_scale.view(expected_shape)
+
+        return per_block_scale.to(torch.float8_e4m3fn)
+
     # If NVFP4, we need to return quantized per_block scaling factors
     if quantization_format in [
         QUANTIZATION_NVFP4,
@@ -343,6 +368,16 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
 
     quantization_format = get_quantization_format(module)
 
+    # Handle NVFP4 static quantizer (use _global_amax instead of _amax)
+    is_nvfp4_static = isinstance(weight_quantizer, NVFP4StaticQuantizer) or getattr(
+        weight_quantizer, "_is_nvfp4_static_quantizer", False
+    )
+    if is_nvfp4_static:
+        assert (
+            hasattr(weight_quantizer, "_global_amax") and weight_quantizer._global_amax is not None
+        )
+        return weight_quantizer._global_amax.float() / (6.0 * 448.0)
+
     # Calibrate weight quantizer if amax is not set for all NVFP4 variants
     if quantization_format in [
         QUANTIZATION_NVFP4,
@@ -735,7 +770,7 @@ def process_layer_quant_config(layer_config_dict):
             layer_config = {"quant_algo": "W8A16"}
         elif v == "int8_sq":
             layer_config = {"quant_algo": "W8A8_SQ_PER_CHANNEL"}
-        elif v == "nvfp4":
+        elif v in ["nvfp4", "nvfp4_static"]:
             layer_config = {
                 "quant_algo": "NVFP4",
                 "group_size": block_size_value,
@@ -1423,6 +1458,24 @@ def get_quant_config(
             if block_size == 0:
                 block_size = get_weight_block_size(module)
 
+            # Static NVFP4 uses pre-computed per-block scales from MSE calibration
+            if quantization_format == QUANTIZATION_NVFP4:
+                weight_quantizer = getattr(module, "weight_quantizer", None)
+                if weight_quantizer is None:
+                    # Try to get from first weight attribute
+                    for wn in weight_names:
+                        weight_quantizer = getattr(
+                            module, quantizer_attr_names(wn).weight_quantizer, None
+                        )
+                        if weight_quantizer is not None:
+                            break
+                if weight_quantizer is not None:
+                    is_static = isinstance(weight_quantizer, NVFP4StaticQuantizer) or getattr(
+                        weight_quantizer, "_is_nvfp4_static_quantizer", False
+                    )
+                    if is_static:
+                        quantization_format = "nvfp4_static"
+
             # Construct per layer config dictionary
             layer_config_dict[name + ".quantization"] = quantization_format
             layer_config_dict[name + ".awq_block_size"] = block_size
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 61bebb51d..289668351 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -50,7 +50,11 @@
 from torch.distributed.fsdp import FSDPModule
 
 from modelopt.torch.quantization import set_quantizer_by_cfg_context
-from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer
+from modelopt.torch.quantization.nn import (
+    NVFP4StaticQuantizer,
+    SequentialQuantizer,
+    TensorQuantizer,
+)
 from modelopt.torch.quantization.qtensor import MXFP8QTensor, NVFP4QTensor
 from modelopt.torch.quantization.utils import fsdp2_aware_weight_update, quantizer_attr_names
 
@@ -502,11 +506,20 @@ def _export_quantized_weight(
         weight, _ = maybe_transpose_expert_weight_dimensions(
             weight, is_bmm_expert_weight=is_bmm_expert_weight
         )
-        weight_scale = NVFP4QTensor.get_weights_scaling_factor(
-            weight,
-            block_size=block_size,
-            weights_scaling_factor_2=weight_scale_2,
-        )[0]
+
+        # Check if this is a static NVFP4 quantizer (has pre-computed scales from MSE calibration)
+        # For static NVFP4, weight_scale is already computed from static _amax values in get_weight_scaling_factor
+        is_nvfp4_static = isinstance(weight_quantizer, NVFP4StaticQuantizer) or getattr(
+            weight_quantizer, "_is_nvfp4_static_quantizer", False
+        )
+
+        if not is_nvfp4_static:
+            # For dynamic NVFP4, compute scales from weights
+            weight_scale = NVFP4QTensor.get_weights_scaling_factor(
+                weight,
+                block_size=block_size,
+                weights_scaling_factor_2=weight_scale_2,
+            )[0]
 
         quantized_weight = to_quantized_weight(
             weight.to(dtype),

From 642f99bf0215866d55399bdffa3090facfa4736f Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Thu, 5 Feb 2026 18:56:28 +0000
Subject: [PATCH 2/5] tmp: experimental config

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 modelopt/torch/quantization/config.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
index e1b48ee60..8f7c31bac 100644
--- a/modelopt/torch/quantization/config.py
+++ b/modelopt/torch/quantization/config.py
@@ -388,6 +388,32 @@
     "algorithm": "max",
 }
 
+NVFP4_WEIGHT_MSE_FP8_SWEEP_CFG = {
+    "quant_cfg": {
+        "*weight_quantizer": {
+            "num_bits": (2, 1),
+            "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)},
+            "axis": None,
+            "enable": True,
+        },
+        # "*input_quantizer": {
+        #     "enable": False,
+        # },
+        "*input_quantizer": {
+            "num_bits": (2, 1),
+            "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},
+            "axis": None,
+            "enable": True,
+        },
+        **_default_disabled_quantizer_cfg,
+    },
+    "algorithm": {
+        "method": "mse",
+        "fp8_scale_sweep": True,
+    },
+}
+
+
 NVFP4_AWQ_LITE_CFG = {
     "quant_cfg": {
         "*weight_quantizer": {

From 519dc2a20f9e3d2f8ce1c1a1ac51eb0380020d85 Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 6 Feb 2026 02:00:33 +0000
Subject: [PATCH 3/5] fix layer fusion

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 modelopt/torch/export/quant_utils.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index c8ebb1150..9e4c14dd0 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -1374,6 +1374,22 @@ def preprocess_linear_fusion(modules: list[torch.nn.Module], resmooth_only=False
                 for module in modules:
                     module.weight_quantizer[-1].amax = weight_amax
 
+        # Handle NVFP4StaticQuantizer: unify _global_amax for fused layers
+        elif isinstance(modules[0].weight_quantizer, NVFP4StaticQuantizer) or getattr(
+            modules[0].weight_quantizer, "_is_nvfp4_static_quantizer", False
+        ):
+            global_amax_list = [
+                m.weight_quantizer._global_amax
+                for m in modules
+                if hasattr(m.weight_quantizer, "_global_amax")
+                and m.weight_quantizer._global_amax is not None
+            ]
+            if global_amax_list:
+                unified_global_amax = torch.max(torch.stack(global_amax_list))
+                for module in modules:
+                    if hasattr(module.weight_quantizer, "_global_amax"):
+                        module.weight_quantizer._global_amax = unified_global_amax
+
         elif (
             modules[0].weight_quantizer.is_enabled
             and modules[0].weight_quantizer.amax is not None

From e0606cb3153ebbf14bc9cf7086e0e556bd71f31e Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 6 Feb 2026 22:56:02 +0000
Subject: [PATCH 4/5] address reviewer feedback, delegate scaling factor
 calculation to NVFP4QTensor

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 modelopt/torch/export/quant_utils.py          | 98 ++++++-------------
 modelopt/torch/export/unified_export_hf.py    |  4 +-
 modelopt/torch/quantization/config.py         |  5 +-
 .../quantization/qtensor/nvfp4_tensor.py      | 89 ++++++++++++++++-
 4 files changed, 116 insertions(+), 80 deletions(-)

diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 9e4c14dd0..90e4f89c2 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -300,40 +300,18 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
 
     quantization_format = get_quantization_format(module)
 
-    # Handle NVFP4 static quantizer (has pre-computed per-block amax values)
-    is_nvfp4_static = isinstance(weight_quantizer, NVFP4StaticQuantizer) or getattr(
-        weight_quantizer, "_is_nvfp4_static_quantizer", False
-    )
-    if is_nvfp4_static:
-        # For static NVFP4, _amax contains per-block amax values and _global_amax is the global scale
-        assert (
-            hasattr(weight_quantizer, "_global_amax") and weight_quantizer._global_amax is not None
-        )
-        global_amax = weight_quantizer._global_amax.float()
-        per_block_amax = weight_quantizer._amax.float()
-
-        block_size = weight_quantizer.block_sizes[-1]
-        weight_scaling_factor_2 = global_amax / (6.0 * 448.0)
-        per_block_scale = per_block_amax / (6.0 * weight_scaling_factor_2.to(per_block_amax.device))
-        per_block_scale[per_block_scale == 0] = 1.0
-
-        # Reshape per_block_scale to match weight's block structure: (rows, num_blocks_per_row)
-        num_blocks_per_row = weight.shape[-1] // block_size
-        expected_shape = (*weight.shape[:-1], num_blocks_per_row)
-        per_block_scale = per_block_scale.view(expected_shape)
-
-        return per_block_scale.to(torch.float8_e4m3fn)
-
-    # If NVFP4, we need to return quantized per_block scaling factors
-    if quantization_format in [
+    # Handle NVFP4 variants (static or dynamic)
+    is_nvfp4_static = isinstance(weight_quantizer, NVFP4StaticQuantizer)
+    if is_nvfp4_static or quantization_format in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
-        # Calibrate weight quantizer if amax is not set
-        module_name = f"{type(module).__name__}.{weight_name}"
-        _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
+        # Calibrate weight quantizer if amax is not set (only needed for dynamic quantizers)
+        if not is_nvfp4_static:
+            module_name = f"{type(module).__name__}.{weight_name}"
+            _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
 
         if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
             # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6.
@@ -343,9 +321,10 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
             weight_scaling_factor_2 = NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(
                 weight_quantizer
             )
-        return NVFP4QTensor.get_weights_scaling_factor(
+        # Unified method handles both static and dynamic quantizers
+        return NVFP4QTensor.get_weights_scaling_factor_from_quantizer(
+            weight_quantizer,
             weight,
-            weight_quantizer.block_sizes[-1],
             weight_scaling_factor_2.to(weight.device),
         )[0]
 
@@ -368,37 +347,26 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
 
     quantization_format = get_quantization_format(module)
 
-    # Handle NVFP4 static quantizer (use _global_amax instead of _amax)
-    is_nvfp4_static = isinstance(weight_quantizer, NVFP4StaticQuantizer) or getattr(
-        weight_quantizer, "_is_nvfp4_static_quantizer", False
-    )
-    if is_nvfp4_static:
-        assert (
-            hasattr(weight_quantizer, "_global_amax") and weight_quantizer._global_amax is not None
-        )
-        return weight_quantizer._global_amax.float() / (6.0 * 448.0)
-
-    # Calibrate weight quantizer if amax is not set for all NVFP4 variants
-    if quantization_format in [
+    is_nvfp4_static = isinstance(weight_quantizer, NVFP4StaticQuantizer)
+    if is_nvfp4_static or quantization_format in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
-        weight = getattr(module, weight_name)
-        module_name = f"{type(module).__name__}.{weight_name}"
-        _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
+        # Calibrate weight quantizer if amax is not set (only needed for dynamic quantizers)
+        if not is_nvfp4_static:
+            weight = getattr(module, weight_name)
+            module_name = f"{type(module).__name__}.{weight_name}"
+            _ensure_weight_quantizer_calibrated(weight_quantizer, weight, module_name)
 
-    if quantization_format in [
-        QUANTIZATION_NVFP4,
-        QUANTIZATION_NVFP4_AWQ,
-        QUANTIZATION_NVFP4_SVDQUANT,
-    ]:
-        return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
-    elif quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
-        # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6.
-        # This is because the kernel dequantizes weight to fp8, which is in range 448.
-        return weight_quantizer._amax.float() / 448.0
+        if quantization_format == QUANTIZATION_W4A8_NVFP4_FP8:
+            # weight_scaling_factor_2 for w4a8 needs to be amax/448, so that the wsf is in range 448/6.
+            # This is because the kernel dequantizes weight to fp8, which is in range 448.
+            return weight_quantizer._amax.float() / 448.0
+        else:
+            # Unified method handles both static and dynamic quantizers
+            return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
 
     # SequentialQuantizer is required
     if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled:
@@ -1374,21 +1342,17 @@ def preprocess_linear_fusion(modules: list[torch.nn.Module], resmooth_only=False
                 for module in modules:
                     module.weight_quantizer[-1].amax = weight_amax
 
-        # Handle NVFP4StaticQuantizer: unify _global_amax for fused layers
-        elif isinstance(modules[0].weight_quantizer, NVFP4StaticQuantizer) or getattr(
-            modules[0].weight_quantizer, "_is_nvfp4_static_quantizer", False
-        ):
+        # Handle NVFP4StaticQuantizer: unify global_amax for fused layers
+        elif isinstance(modules[0].weight_quantizer, NVFP4StaticQuantizer):
             global_amax_list = [
-                m.weight_quantizer._global_amax
+                m.weight_quantizer.global_amax
                 for m in modules
-                if hasattr(m.weight_quantizer, "_global_amax")
-                and m.weight_quantizer._global_amax is not None
+                if m.weight_quantizer.global_amax is not None
             ]
             if global_amax_list:
                 unified_global_amax = torch.max(torch.stack(global_amax_list))
                 for module in modules:
-                    if hasattr(module.weight_quantizer, "_global_amax"):
-                        module.weight_quantizer._global_amax = unified_global_amax
+                    module.weight_quantizer.global_amax = unified_global_amax
 
         elif (
             modules[0].weight_quantizer.is_enabled
@@ -1486,9 +1450,7 @@ def get_quant_config(
                         if weight_quantizer is not None:
                             break
                 if weight_quantizer is not None:
-                    is_static = isinstance(weight_quantizer, NVFP4StaticQuantizer) or getattr(
-                        weight_quantizer, "_is_nvfp4_static_quantizer", False
-                    )
+                    is_static = isinstance(weight_quantizer, NVFP4StaticQuantizer)
                     if is_static:
                         quantization_format = "nvfp4_static"
 
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 289668351..ac66eac96 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -509,9 +509,7 @@ def _export_quantized_weight(
 
         # Check if this is a static NVFP4 quantizer (has pre-computed scales from MSE calibration)
         # For static NVFP4, weight_scale is already computed from static _amax values in get_weight_scaling_factor
-        is_nvfp4_static = isinstance(weight_quantizer, NVFP4StaticQuantizer) or getattr(
-            weight_quantizer, "_is_nvfp4_static_quantizer", False
-        )
+        is_nvfp4_static = isinstance(weight_quantizer, NVFP4StaticQuantizer)
 
         if not is_nvfp4_static:
             # For dynamic NVFP4, compute scales from weights
diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
index 8f7c31bac..956218e60 100644
--- a/modelopt/torch/quantization/config.py
+++ b/modelopt/torch/quantization/config.py
@@ -388,7 +388,7 @@
     "algorithm": "max",
 }
 
-NVFP4_WEIGHT_MSE_FP8_SWEEP_CFG = {
+NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = {
     "quant_cfg": {
         "*weight_quantizer": {
             "num_bits": (2, 1),
@@ -396,9 +396,6 @@
             "axis": None,
             "enable": True,
         },
-        # "*input_quantizer": {
-        #     "enable": False,
-        # },
         "*input_quantizer": {
             "num_bits": (2, 1),
             "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},
diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
index 2ff1b17e9..6ff31424c 100644
--- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
+++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
@@ -52,12 +52,87 @@ def get_e2m1_bounds(cls, device):
             cls.e2m1_bounds_on_device[device] = e2m1_bounds.to(device)
         return cls.e2m1_bounds_on_device[device]
 
+    @classmethod
+    def _is_static_quantizer(cls, weight_quantizer) -> bool:
+        """Return True if ``weight_quantizer`` exposes a non-None ``global_amax`` attribute."""
+        return hasattr(weight_quantizer, "global_amax") and weight_quantizer.global_amax is not None
+
     @classmethod
     def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer):
-        """Returns per tensor weight scaling factor from the weight_quantizer amax."""
-        # Assert that weight_quantizer has attribute amax
-        assert hasattr(weight_quantizer, "_amax"), "Weight quantizer does not have attribute amax"
-        return weight_quantizer._amax.float() / (6.0 * 448.0)
+        """Returns per tensor weight scaling factor from the weight_quantizer.
+
+        Handles both static NVFP4 quantizers (using global_amax) and
+        dynamic quantizers (using _amax).
+
+        Args:
+            weight_quantizer: The weight quantizer (static or dynamic).
+
+        Returns:
+            The global scaling factor as a float tensor.
+        """
+        if cls._is_static_quantizer(weight_quantizer):
+            return weight_quantizer.global_amax.float() / (6.0 * 448.0)
+        else:
+            assert hasattr(weight_quantizer, "_amax"), (
+                "Weight quantizer does not have attribute amax"
+            )
+            return weight_quantizer._amax.float() / (6.0 * 448.0)
+
+    @classmethod
+    def get_weights_scaling_factor_from_quantizer(
+        cls,
+        weight_quantizer,
+        weight: torch.Tensor,
+        weights_scaling_factor_2: torch.Tensor | None = None,
+        keep_high_precision: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Returns quantized per block weight scaling factor from quantizer.
+
+        Handles both static NVFP4 quantizers (with pre-computed per-block amax)
+        and dynamic quantizers (computing from weight tensor).
+
+        Args:
+            weight_quantizer: The weight quantizer (static or dynamic).
+            weight: The weight tensor (used for shape in static, values in dynamic).
+            weights_scaling_factor_2: Optional global scale; derived from the quantizer if None.
+            keep_high_precision: Whether to keep scales in high precision.
+
+        Returns:
+            Tuple of (per_block_scale, weights_scaling_factor_2).
+        """
+        block_size = weight_quantizer.block_sizes[-1]
+
+        if weights_scaling_factor_2 is None:
+            weights_scaling_factor_2 = cls.get_weights_scaling_factor_2_from_quantizer(
+                weight_quantizer
+            )
+
+        if cls._is_static_quantizer(weight_quantizer):
+            # Static path: use pre-computed per-block amax values from quantizer
+            global_amax = weight_quantizer.global_amax.float()
+            per_block_amax = weight_quantizer._amax.float()
+
+            # Compute scales in float
+            per_block_scale_max = global_amax / 6.0
+            per_block_scale = per_block_amax / 6.0
+            per_block_scale[per_block_scale == 0] = 1.0
+
+            # Reshape per_block_scale to match weight's block structure
+            num_blocks_per_row = weight.shape[-1] // block_size
+            expected_shape = (*weight.shape[:-1], num_blocks_per_row)
+            per_block_scale = per_block_scale.view(expected_shape)
+
+            # Quantize scales to FP8
+            if not keep_high_precision:
+                per_block_scale = (per_block_scale * 448.0 / per_block_scale_max).to(
+                    torch.float8_e4m3fn
+                )
+            return per_block_scale, weights_scaling_factor_2
+        else:
+            # Dynamic path: compute from weight tensor
+            return cls.get_weights_scaling_factor(
+                weight, block_size, weights_scaling_factor_2, keep_high_precision
+            )
 
     @classmethod
     def get_weights_scaling_factor(
@@ -67,7 +142,11 @@ def get_weights_scaling_factor(
         weights_scaling_factor_2: torch.Tensor | None = None,
         keep_high_precision: bool = False,
     ):
-        """Returns quantized per block weight scaling factor."""
+        """Returns quantized per block weight scaling factor from weight tensor.
+
+        This is the dynamic path that computes scales directly from the weight values.
+        For quantizers with pre-computed amax, use get_weights_scaling_factor_from_quantizer.
+        """
         if weights_scaling_factor_2 is None:
             weights_scaling_factor_2 = cls.get_weights_scaling_factor_2(input)
 

From 9725c34b64281ae1e3ced6f0432ae95838ed4ae3 Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Sat, 7 Feb 2026 01:09:01 +0000
Subject: [PATCH 5/5] update example script for export

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 examples/llm_ptq/hf_ptq.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index a5af5e97d..8691a2db2 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -80,6 +80,7 @@
     "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
     "nvfp4": mtq.NVFP4_DEFAULT_CFG,
     "nvfp4_awq": mtq.NVFP4_AWQ_LITE_CFG,
+    "nvfp4_mse": mtq.NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG,
     "fp8_pb_wo": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG,
     "fp8_pc_pt": mtq.FP8_PER_CHANNEL_PER_TOKEN_CFG,
     "w4a8_nvfp4_fp8": mtq.W4A8_NVFP4_FP8_CFG,