support mxfp8 quantization (qwen dense) (#5723)

### What this PR does / why we need it?
support mxfp8 quantization (qwen linear layer)

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef


Signed-off-by: wangyao <iwangyao@outlook.com>
This commit is contained in:
wangyao-i
2026-01-09 16:26:31 +08:00
committed by GitHub
parent 09b3f9d91b
commit 3b997fdd32
3 changed files with 112 additions and 3 deletions

View File

@@ -14,6 +14,7 @@ from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
AscendW8A8DynamicLinearMethod)
from .w8a8_pdmix import (AscendW8A8PDMixFusedMoeMethod,
AscendW8A8PDMixLinearMethod)
from .w8a8mxfp8 import AscendW8A8MXFP8DynamicLinearMethod
from .w8a16 import AscendW8A16LinearMethod
ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
@@ -40,7 +41,10 @@ ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
},
"W8A16": {
"linear": AscendW8A16LinearMethod,
}
},
"W8A8_MXFP8": {
"linear": AscendW8A8MXFP8DynamicLinearMethod,
},
}
@@ -113,3 +117,9 @@ def get_quant_method_modelslim(
)
raise NotImplementedError("Currently, vLLM Ascend only supports following quant types:" \
f"{list(ASCEND_QUANTIZATION_METHOD_MAP.keys())}")
def is_mx_quant_type(instance: Any) -> bool:
    """Return True if *instance* is an MX (microscaling) quantization method.

    Currently only the W8A8 MXFP8 dynamic linear method counts as MX;
    add further classes to the isinstance check as more MX formats land.
    """
    return isinstance(instance, AscendW8A8MXFP8DynamicLinearMethod)