[feature] Support W8A8 PD-Mix Quantization (#4235)
In PD-separated deployment scenarios:

* MoE layers use dynamic quantization exclusively.
* For the Attention module, Prefill (P) nodes use **dynamic** quantization, while Decode (D) nodes use **static** quantization.

In PD-mixed deployment scenarios:

* **All components fall back to dynamic quantization**, as it is difficult to distinguish between Prefill and Decode tokens.

---

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

---------

Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
Signed-off-by: Slightwind <slightwindsec@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
```diff
@@ -12,6 +12,8 @@ from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
                    AscendW8A8LinearMethod)
 from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
                            AscendW8A8DynamicLinearMethod)
+from .w8a8_pdmix import (AscendW8A8PDMixFusedMoeMethod,
+                         AscendW8A8PDMixLinearMethod)
 
 ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
     "W4A8_DYNAMIC": {
@@ -30,6 +32,10 @@ ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
         "linear": AscendW8A8DynamicLinearMethod,
         "moe": AscendW8A8DynamicFusedMoEMethod,
     },
+    "W8A8_MIX": {
+        "linear": AscendW8A8PDMixLinearMethod,
+        "moe": AscendW8A8PDMixFusedMoeMethod,
+    },
     "C8": {
         "attention": AscendC8KVCacheMethod,
     },
```
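The diff registers the new PD-Mix methods in a nested type registry keyed by quantization scheme and layer kind. A hedged sketch of how such a registry can be consumed follows; the class and function names here are stand-ins, not the real vllm-ascend implementations.

```python
from typing import Any, Dict, Type

class W8A8PDMixLinear:      # stand-in for AscendW8A8PDMixLinearMethod
    pass

class W8A8PDMixMoe:         # stand-in for AscendW8A8PDMixFusedMoeMethod
    pass

# Hypothetical mirror of ASCEND_QUANTIZATION_METHOD_MAP's shape:
# quant scheme -> layer kind -> method class.
METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
    "W8A8_MIX": {"linear": W8A8PDMixLinear, "moe": W8A8PDMixMoe},
}

def get_quant_method(quant_type: str, layer_kind: str) -> Type[Any]:
    """Resolve the method class for a (scheme, layer) pair, with a clear
    error when the combination is unregistered."""
    try:
        return METHOD_MAP[quant_type][layer_kind]
    except KeyError:
        raise ValueError(f"no {layer_kind!r} method for {quant_type!r}")
```

Keeping the registry as plain data means adding a scheme like `"W8A8_MIX"` is a two-line import plus one dict entry, with no changes to the dispatch logic.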