[Feature] Add W4A4 Flat Quantization support (#3427)
Introduce W4A4 Flat Quantization for better model compression and inference efficiency on Ascend devices.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
This commit is contained in:
@@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Type
|
||||
|
||||
from vllm.logger import logger
|
||||
|
||||
from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod
|
||||
from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
|
||||
AscendW4A8DynamicLinearMethod)
|
||||
from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
|
||||
@@ -14,6 +15,9 @@ ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
|
||||
"linear": AscendW4A8DynamicLinearMethod,
|
||||
"moe": AscendW4A8DynamicFusedMoEMethod,
|
||||
},
|
||||
"W4A4_FLATQUANT_DYNAMIC": {
|
||||
"linear": AscendW4A4FlatQuantDynamicLinearMethod,
|
||||
},
|
||||
"W8A8": {
|
||||
"linear": AscendW8A8LinearMethod,
|
||||
"moe": AscendW8A8FusedMoEMethod,
|
||||
|
||||
Reference in New Issue
Block a user