support pangumoe w8a8c8 and docs (#1477)

### What this PR does / why we need it?
Support Pangu MoE W8A8C8 quantization: int8 weights and activations (W8A8) plus an int8 KV cache (C8).
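
Below is a minimal usage sketch (not part of this PR) of loading a W8A8C8-quantized checkpoint through vllm-ascend. The model path is a placeholder, and `quantization="ascend"` follows the vllm-ascend quantization docs; treat the snippet as an assumption-laden illustration, not the PR's own test.

```python
# Hypothetical usage sketch: serve a Pangu MoE checkpoint that was
# quantized offline to W8A8C8 (int8 weights, activations, and KV cache).
from vllm import LLM, SamplingParams

llm = LLM(
    model="/path/to/pangu-moe-w8a8c8",  # placeholder quantized checkpoint
    quantization="ascend",              # route through vllm-ascend quantizers
    max_model_len=4096,
)
out = llm.generate(["Hello"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)
```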

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with the newly added test.

Signed-off-by: zhuyilin <809721801@qq.com>

@@ -24,7 +24,8 @@ from vllm.logger import logger
 from .func_wrapper import (wrapper_load_model, wrapper_rmsnorm_forward_oot,
                            wrapper_rmsnorm_init)
-from .w8a8 import AscendW8A8LinearMethod
+from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
+                   AscendW8A8LinearMethod)
 from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
                            AscendW8A8DynamicLinearMethod)
@@ -250,6 +251,8 @@ class VLLMAscendQuantizer:
         # Attention
         if '.attn' in prefix and 'fa_quant_type' in quant_description.keys():
             quant_type = quant_description['fa_quant_type']
+        elif '.attn' in prefix and 'kv_quant_type' in quant_description.keys():
+            quant_type = quant_description['kv_quant_type']
         # Linear
         else:
             quant_type = cls.get_linear_quant_type(quant_description, prefix,
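
For reference, here is a standalone sketch of the dispatch this hunk extends. Everything in the snippet except the `kv_quant_type` key and the `.attn` prefix check is an illustrative assumption:

```python
# Standalone sketch of the quant-type dispatch above; the contents of
# quant_description beyond 'kv_quant_type' are assumptions.
quant_description = {"kv_quant_type": "C8"}
prefix = "model.layers.0.self_attn.attn"

if '.attn' in prefix and 'fa_quant_type' in quant_description:
    quant_type = quant_description['fa_quant_type']
elif '.attn' in prefix and 'kv_quant_type' in quant_description:
    quant_type = quant_description['kv_quant_type']
else:
    quant_type = "W8A8"  # stand-in for the linear-layer lookup

print(quant_type)  # -> "C8": this attention layer gets an int8 KV cache
```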
@@ -269,6 +272,14 @@ class W8A8Quantizer(VLLMAscendQuantizer):
     @staticmethod
     def build_linear_method():
         return AscendW8A8LinearMethod()
 
+    @staticmethod
+    def build_moe_method():
+        return AscendW8A8FusedMoEMethod()
+
+    @staticmethod
+    def build_attention_method():
+        return AscendC8KVCacheMethod()
+
 
 class W8A8DYNAMICQuantizer(VLLMAscendQuantizer):
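
With these builders, W8A8Quantizer now hands back a per-layer-kind quant method object. A hedged sketch of the calling pattern (the class and builder names come from the diff; the call sites are assumed):

```python
# Assumed calling pattern, one builder per layer kind:
linear_method = W8A8Quantizer.build_linear_method()    # AscendW8A8LinearMethod
moe_method = W8A8Quantizer.build_moe_method()          # AscendW8A8FusedMoEMethod
attn_method = W8A8Quantizer.build_attention_method()   # AscendC8KVCacheMethod
```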
@@ -284,4 +295,5 @@ class W8A8DYNAMICQuantizer(VLLMAscendQuantizer):
 SUPPORT_ASCEND_QUANTIZER_TYPE = {
     "W8A8": W8A8Quantizer,
     "W8A8_DYNAMIC": W8A8DYNAMICQuantizer,
+    "C8": W8A8Quantizer,
 }
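
Because "C8" maps to the same W8A8Quantizer class as "W8A8", a KV-cache-only quant type resolves to the new attention builder. A simplified sketch of that lookup follows; the helper function is an assumption, not the repository's actual entry point:

```python
# Simplified stand-in for the quantizer lookup; only the mapping and the
# builder names come from this diff.
def resolve_method(quant_type: str):
    quantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[quant_type]
    if quant_type == "C8":
        # int8 KV cache: quantize/dequantize around the attention op
        return quantizer.build_attention_method()  # AscendC8KVCacheMethod
    return quantizer.build_linear_method()         # AscendW8A8LinearMethod

method = resolve_method("C8")
```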