[Feature] Add W4A4 Flat Quantization support (#3427)

Introduce W4A4 Flat Quantization for better model compression and inference efficiency on Ascend devices.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2025-10-13 23:20:16 +08:00
parent 6972df5951
commit 4f6d60eb06
3 changed files with 511 additions and 0 deletions
--- a/vllm_ascend/quantization/utils.py
+++ b/vllm_ascend/quantization/utils.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Type

 from vllm.logger import logger

+from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod
 from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
                           AscendW4A8DynamicLinearMethod)
 from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
@@ -14,6 +15,9 @@ ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
        "linear": AscendW4A8DynamicLinearMethod,
        "moe": AscendW4A8DynamicFusedMoEMethod,
    },
+    "W4A4_FLATQUANT_DYNAMIC": {
+        "linear": AscendW4A4FlatQuantDynamicLinearMethod,
+    },
    "W8A8": {
        "linear": AscendW8A8LinearMethod,
        "moe": AscendW8A8FusedMoEMethod,