[Feature] Add W4A4 Flat Quantization support (#3427)

Introduce W4A4 FlatQuant dynamic quantization (4-bit weights, 4-bit activations) to improve
model compression and inference efficiency on Ascend devices.
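
For intuition, below is a minimal numerical sketch of what W4A4 dynamic quantization means: both weights and activations are quantized to 4-bit integers, with activation scales computed per token at runtime. The function names and the symmetric per-row scaling are illustrative assumptions only, not the kernels added by this PR; FlatQuant itself additionally learns affine transformations to flatten weight/activation distributions before quantization, which this sketch omits.

```python
# Illustrative W4A4 dynamic quantization sketch (names and scaling scheme
# are assumptions for exposition, NOT the FlatQuant implementation).
import torch


def quantize_symmetric_int4(x: torch.Tensor, dim: int = -1):
    # Symmetric quantization to the signed 4-bit range [-8, 7],
    # with one scale per slice along `dim`.
    scale = x.abs().amax(dim=dim, keepdim=True).clamp(min=1e-8) / 7.0
    q = torch.clamp(torch.round(x / scale), min=-8, max=7)
    return q, scale


def w4a4_dynamic_linear(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # W4: weights quantized per output channel (done once, offline).
    qw, w_scale = quantize_symmetric_int4(weight, dim=-1)
    # A4 "dynamic": activations quantized per token at runtime.
    qx, x_scale = quantize_symmetric_int4(x, dim=-1)
    # Integer matmul emulated in float here; a real kernel uses packed int4.
    y = qx @ qw.t()
    return y * x_scale * w_scale.t()
```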

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
Author: Slightwind
Date: 2025-10-13 23:20:16 +08:00
Committed by: GitHub
Parent: 6972df5951
Commit: 4f6d60eb06
3 changed files with 511 additions and 0 deletions

@@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Type
 from vllm.logger import logger
+from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod
 from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
                            AscendW4A8DynamicLinearMethod)
 from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
@@ -14,6 +15,9 @@ ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
         "linear": AscendW4A8DynamicLinearMethod,
         "moe": AscendW4A8DynamicFusedMoEMethod,
     },
+    "W4A4_FLATQUANT_DYNAMIC": {
+        "linear": AscendW4A4FlatQuantDynamicLinearMethod,
+    },
     "W8A8": {
         "linear": AscendW8A8LinearMethod,
         "moe": AscendW8A8FusedMoEMethod,