support pangumoe w8a8c8 and docs (#1477)
### What this PR does / why we need it?
Support Pangu MoE W8A8C8 quantization.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with a newly added test.

Signed-off-by: zhuyilin <809721801@qq.com>
This commit is contained in:
@@ -24,7 +24,8 @@ from vllm.logger import logger
 from .func_wrapper import (wrapper_load_model, wrapper_rmsnorm_forward_oot,
                            wrapper_rmsnorm_init)
-from .w8a8 import AscendW8A8LinearMethod
+from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
+                   AscendW8A8LinearMethod)
 from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
                            AscendW8A8DynamicLinearMethod)
@@ -250,6 +251,8 @@ class VLLMAscendQuantizer:
         # Attention
         if '.attn' in prefix and 'fa_quant_type' in quant_description.keys():
             quant_type = quant_description['fa_quant_type']
+        if '.attn' in prefix and 'kv_quant_type' in quant_description.keys():
+            quant_type = quant_description['kv_quant_type']
         # Linear
         else:
             quant_type = cls.get_linear_quant_type(quant_description, prefix,
@@ -269,6 +272,14 @@ class W8A8Quantizer(VLLMAscendQuantizer):
     def build_linear_method():
         return AscendW8A8LinearMethod()

+    @staticmethod
+    def build_moe_method():
+        return AscendW8A8FusedMoEMethod()
+
+    @staticmethod
+    def build_attention_method():
+        return AscendC8KVCacheMethod()
+

 class W8A8DYNAMICQuantizer(VLLMAscendQuantizer):
@@ -284,4 +295,5 @@ class W8A8DYNAMICQuantizer(VLLMAscendQuantizer):
 SUPPORT_ASCEND_QUANTIZER_TYPE = {
     "W8A8": W8A8Quantizer,
     "W8A8_DYNAMIC": W8A8DYNAMICQuantizer,
+    "C8": W8A8Quantizer,
 }
Reference in New Issue
Block a user