[Feat] Support native Kimi-K2-Thinking native W4A16 quantized experts weights (#4516)

### What this PR does / why we need it? Adds W4A16 quantization method for the Kimi-K2-Thinking model and updates relevant modules to support the new quantization method. - Implements complete W4A16 quantization method including weight packing/unpacking, per-group quantization parameter generation, post-processing logic and MoE method application. - Adds parameters `use_int4_w4a16`, `w1_offset` and `w2_offset`, adjusts `with_quant` conditional logic to support W4A16 matrix multiplication. - Adds `packed_modules_model_mapping` for Kimi-K2-Thinking model and processing logic for `weight_packed` field. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: zhoux77899 <zhouxiang100@huawei.com> Signed-off-by: Ruri <33858552+zhoux77899@users.noreply.github.com> Signed-off-by: Ruri <zhouxiang100@huawei.com>
2025-12-10 15:58:52 +08:00
parent c1db298f43
commit ce5872705e
13 changed files with 781 additions and 13 deletions
--- a/vllm_ascend/quantization/utils.py
+++ b/vllm_ascend/quantization/utils.py
@@ -8,6 +8,7 @@ from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
 from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod
 from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
                           AscendW4A8DynamicLinearMethod)
+from .w4a16 import AscendW4A16FusedMoEMethod
 from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
                   AscendW8A8LinearMethod)
 from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
@@ -16,6 +17,9 @@ from .w8a8_pdmix import (AscendW8A8PDMixFusedMoeMethod,
                         AscendW8A8PDMixLinearMethod)

 ASCEND_QUANTIZATION_METHOD_MAP: Dict[str, Dict[str, Type[Any]]] = {
+    "W4A16": {
+        "moe": AscendW4A16FusedMoEMethod,
+    },
    "W4A8_DYNAMIC": {
        "linear": AscendW4A8DynamicLinearMethod,
        "moe": AscendW4A8DynamicFusedMoEMethod,