[Feature] Add support for the new W4A4_LAOS_DYNAMIC quantization method (#5143)

Introduce W4A4 LAOS Quantization for better model compression and inference efficiency on Ascend devices. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2026-01-22 05:34:58 +03:00
parent dd8571860d
commit ef9d8367f5
4 changed files with 134 additions and 0 deletions
--- a/vllm_ascend/quantization/utils.py
+++ b/vllm_ascend/quantization/utils.py
@@ -6,6 +6,7 @@ from vllm.logger import logger
 from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD

 from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod
+from .w4a4_laos_dynamic import AscendW4A4LaosDynamicLinearMethod
 from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
                           AscendW4A8DynamicLinearMethod)
 from .w4a16 import AscendW4A16FusedMoEMethod
@@ -25,6 +26,9 @@ ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
        "linear": AscendW4A8DynamicLinearMethod,
        "moe": AscendW4A8DynamicFusedMoEMethod,
    },
+    "W4A4_DYNAMIC": {
+        "linear": AscendW4A4LaosDynamicLinearMethod,
+    },
    "W4A4_FLATQUANT_DYNAMIC": {
        "linear": AscendW4A4FlatQuantDynamicLinearMethod,
    },