support mxfp8 quantization (qwen dense) (#5723)

### What this PR does / why we need it?
support mxfp8 quantization (qwen linear layer)

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef


Signed-off-by: wangyao <iwangyao@outlook.com>
This commit is contained in:
wangyao-i
2026-01-09 16:26:31 +08:00
committed by GitHub
parent 09b3f9d91b
commit 3b997fdd32
3 changed files with 112 additions and 3 deletions

View File

@@ -14,6 +14,7 @@ from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
AscendW8A8DynamicLinearMethod)
from .w8a8_pdmix import (AscendW8A8PDMixFusedMoeMethod,
AscendW8A8PDMixLinearMethod)
from .w8a8mxfp8 import AscendW8A8MXFP8DynamicLinearMethod
from .w8a16 import AscendW8A16LinearMethod
ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
@@ -40,7 +41,10 @@ ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
},
"W8A16": {
"linear": AscendW8A16LinearMethod,
}
},
"W8A8_MXFP8": {
"linear": AscendW8A8MXFP8DynamicLinearMethod,
},
}
@@ -113,3 +117,9 @@ def get_quant_method_modelslim(
)
raise NotImplementedError("Currently, vLLM Ascend only supports following quant types:" \
f"{list(ASCEND_QUANTIZATION_METHOD_MAP.keys())}")
def is_mx_quant_type(instance: Any) -> bool:
    """Return True if *instance* is an MX (microscaling) quantization method.

    Currently only the W8A8 MXFP8 dynamic linear method counts as MX;
    add further classes to the isinstance check as more MX formats land.
    """
    return isinstance(instance, AscendW8A8MXFP8DynamicLinearMethod)