support mxfp8 quantization (qwen dense) (#5723)

### What this PR does / why we need it? Support mxfp8 quantization (qwen linear layer) ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.13.0 - vLLM main: 2f4e6548ef Signed-off-by: wangyao <iwangyao@outlook.com>
2026-01-09 16:26:31 +08:00
parent 09b3f9d91b
commit 3b997fdd32
3 changed files with 112 additions and 3 deletions
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -45,7 +45,7 @@ from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, flashcomm2_enable,
                               mlp_tp_enable, oproj_tp_enable)

-from .utils import get_quant_method
+from .utils import get_quant_method, is_mx_quant_type


@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
@@ -401,7 +401,8 @@ class AscendLinearMethod(LinearMethodBase):
            set_weight_attrs(param, {"output_dim": 0})
            layer.register_parameter(pergroup_name, param)
            set_weight_attrs(param, extra_weight_attrs)
-            if "weight_scale_second" in pergroup_name or "weight_offset_second" in pergroup_name:
+            if "weight_scale_second" in pergroup_name or "weight_offset_second" in pergroup_name \
+                or is_mx_quant_type(self.quant_method):
                setattr(param, "input_dim", 1)
                param.input_dim = 1