[quantization] Add w8a16 quantization support (#4541)

### What this PR does / why we need it?

Related to https://github.com/vllm-project/vllm-ascend/issues/4267

### Does this PR introduce _any_ user-facing change?

Yes, w8a16 quantization is now supported.

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

### Test

Tested using [aisbench](https://gitee.com/aisbench/benchmark/) with tp2.

#### Precision

| | ceval | mmlu | gsm8k |
| -- | -- | -- | -- |
| bf16 | 90.46 | 89.17 | 96.21 |
| w8a16 | 89.51 | 89.29 | 95.98 |

#### Performance

| | input_len | output_len | concurrency | TTFT (ms) | TPOT (ms) | TPS (Total) (tokens/s) |
| -- | -- | -- | -- | -- | -- | -- |
| bf16 | 2048 | 2048 | 10 | 1911.7136 | 77.988 | 253.9866 |
| w8a16 | 2048 | 2048 | 10 | 2128.6334 | 67.1633 | 293.9117 |
| bf16 | 3500 | 1024 | 10 | 3076.2509 | 84.3525 | 506.949 |
| w8a16 | 3500 | 1024 | 10 | 2685.2031 | 73.015 | 585.4717 |

---------

Signed-off-by: yyt <yangyit139@gmail.com>
Signed-off-by: TmacAaron <yangyit139@gmail.com>
Co-authored-by: realliujiaxu <realliujiaxu@163.com>
2025-12-24 19:49:32 +08:00
parent 515267de22
commit 5018f2d8fd
4 changed files with 229 additions and 1 deletions
--- a/vllm_ascend/quantization/utils.py
+++ b/vllm_ascend/quantization/utils.py
@@ -14,6 +14,7 @@ from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
                           AscendW8A8DynamicLinearMethod)
 from .w8a8_pdmix import (AscendW8A8PDMixFusedMoeMethod,
                         AscendW8A8PDMixLinearMethod)
+from .w8a16 import AscendW8A16LinearMethod

 ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
    "W4A16": {
@@ -36,6 +37,9 @@ ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
    "W8A8_MIX": {
        "linear": AscendW8A8PDMixLinearMethod,
        "moe": AscendW8A8PDMixFusedMoeMethod,
+    },
+    "W8A16": {
+        "linear": AscendW8A16LinearMethod,
    }
 }