[gpt-oss] Add gpt-oss mxfp4 support
@@ -6,14 +6,16 @@ from typing import Any, Callable, Optional
 import torch
 import torch.nn.functional as F
 
-import vllm.envs as envs
+from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
 from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    OCP_MX_BLOCK_SIZE, per_token_group_quant_mxfp4)
+    OCP_MX_BLOCK_SIZE, dequant_mxfp4, quant_dequant_mxfp4)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            PackedvLLMParameter)
 from vllm.platforms import current_platform
 
+logger = init_logger(__name__)
+
 __all__ = ["QuarkW4A4MXFP4"]
 
 
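For context on what the new helpers compute: MXFP4 is the OCP microscaling (MX) FP4 format, in which elements are 4-bit E2M1 values stored in blocks of 32 that share one power-of-two (E8M0) scale. The sketch below is a rough, illustrative reference for the quantize-dequantize round trip, assuming round-to-nearest element rounding and the spec's shared-scale rule; `quant_dequant_mxfp4_ref` is a name invented here, not vLLM's implementation, whose rounding details may differ.

```python
import torch

OCP_MX_BLOCK_SIZE = 32  # block size fixed by the OCP MX spec

# The 15 representable FP4 E2M1 values (max magnitude 6.0).
FP4_E2M1_VALUES = torch.tensor(
    [-6.0, -4.0, -3.0, -2.0, -1.5, -1.0, -0.5,
     0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])


def quant_dequant_mxfp4_ref(x: torch.Tensor) -> torch.Tensor:
    """Round-trip x through MXFP4; last dim must be a multiple of 32."""
    orig_shape, orig_dtype = x.shape, x.dtype
    x = x.reshape(-1, OCP_MX_BLOCK_SIZE).float()
    # Shared E8M0 scale per block: the power of two that maps the largest
    # element near the top of the E2M1 range (emax = 2, i.e. 6.0).
    amax = x.abs().amax(dim=-1, keepdim=True).clamp_min(1e-30)
    scale = torch.exp2(torch.floor(torch.log2(amax)) - 2)
    # Snap each scaled element to the nearest representable E2M1 value.
    idx = (x / scale).unsqueeze(-1).sub(FP4_E2M1_VALUES).abs().argmin(dim=-1)
    return (FP4_E2M1_VALUES[idx] * scale).reshape(orig_shape).to(orig_dtype)


x = torch.randn(4, 64)
err = (x - quant_dequant_mxfp4_ref(x)).abs().mean()
print(f"mean abs QDQ error: {err:.4f}")
```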
@@ -25,7 +27,29 @@ class QuarkW4A4MXFP4(QuarkScheme):
         self.qscheme = "per_group"
         self.weight_quant_spec = weight_quant_spec
         self.input_quant_spec = input_quant_spec
-        self.emulate = not current_platform.supports_mx()
+
+        self.static_input_scales = not input_quant_spec.get("is_dynamic")
+
+        if self.static_input_scales:
+            raise NotImplementedError(
+                "QuarkW4A4MXFP4 with static input scales is currently not "
+                "implemented. Please open an issue.")
+
+        if not current_platform.supports_mx():
+            self.emulate = True
+            logger.warning_once(
+                "The current platform does not support native MXFP4 "
+                "computation. Simulated weight dequantization and activation "
+                "QDQ (quantize and dequantize) will be used, with the linear "
+                "layers computed in high precision.")
+        else:
+            self.emulate = True
+            logger.warning_once(
+                "The current platform supports native MXFP4 "
+                "computation, but kernels are not yet integrated in vLLM. "
+                "Simulated weight dequantization and activation "
+                "QDQ (quantize and dequantize) will be used, with the linear "
+                "layers computed in high precision.")
 
     @classmethod
     def get_min_capability(cls) -> int:
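The new static-scale guard keys off the `is_dynamic` field of the Quark input spec. A hypothetical spec dict (every field except `is_dynamic` is made up for illustration) shows how the flag falls out:

```python
# Hypothetical Quark-style activation quant spec: only "is_dynamic" is
# consulted by this guard; the other fields are illustrative.
input_quant_spec = {
    "dtype": "fp4",
    "qscheme": "per_group",
    "group_size": 32,
    "is_dynamic": True,  # scales recomputed on the fly per group
}

static_input_scales = not input_quant_spec.get("is_dynamic")
assert not static_input_scales  # dynamic scales -> the supported path
```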
@@ -37,43 +61,6 @@ class QuarkW4A4MXFP4(QuarkScheme):
         layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data,
                                                 requires_grad=False)
-
-        if self.emulate:
-            try:
-                from quark.torch.export.nn.modules import realquantizer
-                from quark.torch.quantization.config.config import (
-                    QuantizationSpec)
-            except ImportError as err:
-                raise ImportError(
-                    "The package `amd-quark` is required to use AMD Quark "
-                    "MX-FP4 models. Please install it with `pip install "
-                    "amd-quark`.") from err
-
-            weight_quant_spec = QuantizationSpec.from_dict(
-                self.weight_quant_spec)
-
-            weight_quantizer = realquantizer.get_real_quantizer(
-                qspec=weight_quant_spec,
-                quantizer=None,
-                real_quantized=True,
-                reorder=False,
-                float_dtype=self.out_dtype,
-                scale_shape=layer.weight_scale.shape,
-                zero_point_shape=None,
-            )
-            weight_quantizer.scale.data = layer.weight_scale.data
-
-            if not envs.VLLM_QUARK_EMU_MEM_OPT:
-                layer.weight = torch.nn.Parameter(
-                    weight_quantizer(layer.weight.data).to(self.out_dtype),
-                    requires_grad=False,
-                )
-            else:
-                self.weight_quantizer = weight_quantizer
-            layer.weight_scale = None
-
-            # This call is necessary to release the scales memory.
-            torch.cuda.empty_cache()
 
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: list[int],
                        input_size_per_partition: int,
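The deleted branch pre-dequantized the packed weights through amd-quark's realquantizer at load time, with VLLM_QUARK_EMU_MEM_OPT switching to on-the-fly dequantization to save memory. Back-of-the-envelope arithmetic for a single hypothetical 4096x4096 layer shows the footprint gap that flag traded against per-forward dequant cost:

```python
out_features, in_features = 4096, 4096
n = out_features * in_features

packed_bytes = n // 2  # two 4-bit E2M1 codes per byte
scale_bytes = n // 32  # one 1-byte E8M0 scale per 32-element block
bf16_bytes = n * 2     # weight pre-dequantized to bfloat16

print(f"packed mxfp4:     {(packed_bytes + scale_bytes) / 2**20:.1f} MiB")
print(f"dequantized bf16: {bf16_bytes / 2**20:.1f} MiB")  # ~3.8x larger
```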
@@ -116,11 +103,10 @@ class QuarkW4A4MXFP4(QuarkScheme):
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
         if self.emulate:
-            if envs.VLLM_QUARK_EMU_MEM_OPT:
-                dq_w = self.weight_quantizer(layer.weight).to(self.out_dtype)
-            else:
-                dq_w = layer.weight
-            qdq_x, _ = per_token_group_quant_mxfp4(x, OCP_MX_BLOCK_SIZE)
-            return F.linear(qdq_x, dq_w, bias)
+            dq_w = dequant_mxfp4(layer.weight, layer.weight_scale, x.dtype)
+
+            x = quant_dequant_mxfp4(x)
+
+            return F.linear(x, dq_w, bias)
         else:
             raise NotImplementedError()
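Pulled out of the diff context, the new emulation path in the apply method reduces to three steps. A minimal sketch follows; `emulated_mxfp4_linear` is a standalone rephrasing for illustration, not the actual method, while the two helpers are the ones this commit imports from `mxfp4_utils`:

```python
import torch.nn.functional as F

from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
    dequant_mxfp4, quant_dequant_mxfp4)


def emulated_mxfp4_linear(x, packed_weight, weight_scale, bias=None):
    # 1. Unpack the FP4 weight codes and apply per-block scales -> x.dtype.
    dq_w = dequant_mxfp4(packed_weight, weight_scale, x.dtype)
    # 2. Round-trip activations through MXFP4 so the matmul sees the same
    #    quantization error a native kernel would introduce.
    x = quant_dequant_mxfp4(x)
    # 3. The GEMM itself runs in high precision, hence "emulation".
    return F.linear(x, dq_w, bias)
```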