[Feature] Add support for the new W4A4_LAOS_DYNAMIC quantization method (#5143)
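Introduce W4A4 LAOS quantization for better model compression and
inference efficiency on Ascend devices. The linear method
(AscendW4A4LaosDynamicLinearMethod) is registered under the
"W4A4_DYNAMIC" key of ASCEND_QUANTIZATION_METHOD_MAP.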
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
@@ -6,6 +6,7 @@ from vllm.logger import logger
|
||||
from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
|
||||
|
||||
from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod
|
||||
from .w4a4_laos_dynamic import AscendW4A4LaosDynamicLinearMethod
|
||||
from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
|
||||
AscendW4A8DynamicLinearMethod)
|
||||
from .w4a16 import AscendW4A16FusedMoEMethod
|
||||
@@ -25,6 +26,9 @@ ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
|
||||
"linear": AscendW4A8DynamicLinearMethod,
|
||||
"moe": AscendW4A8DynamicFusedMoEMethod,
|
||||
},
|
||||
"W4A4_DYNAMIC": {
|
||||
"linear": AscendW4A4LaosDynamicLinearMethod,
|
||||
},
|
||||
"W4A4_FLATQUANT_DYNAMIC": {
|
||||
"linear": AscendW4A4FlatQuantDynamicLinearMethod,
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user