[Feat] Support Kimi-K2-Thinking native W4A16 quantized expert weights (#4516)

### What this PR does / why we need it?

Adds a W4A16 quantization method for the Kimi-K2-Thinking model and updates the relevant modules to support the new quantization method.

- Implements the complete W4A16 quantization method, including weight packing/unpacking, per-group quantization parameter generation, post-processing logic, and MoE method application.
- Adds the parameters `use_int4_w4a16`, `w1_offset` and `w2_offset`, and adjusts the `with_quant` conditional logic to support W4A16 matrix multiplication.
- Adds `packed_modules_model_mapping` for the Kimi-K2-Thinking model and processing logic for the `weight_packed` field.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
Signed-off-by: Ruri <33858552+zhoux77899@users.noreply.github.com>
Signed-off-by: Ruri <zhouxiang100@huawei.com>
2025-12-10 15:58:52 +08:00
parent c1db298f43
commit ce5872705e
13 changed files with 781 additions and 13 deletions
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -49,6 +49,10 @@ DEEPSEEK_W4A8_MODELS = [
    "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
 ]

+KIMI_W4A16_MODELS = [
+    "vllm-ascend/Kimi-K2-Thinking-Pruning",
+]
+

 def test_models_distributed_QwQ():
    example_prompts = [
@@ -250,3 +254,24 @@ def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(model):
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@pytest.mark.parametrize("model", KIMI_W4A16_MODELS)
+def test_models_distributed_Kimi_K2_Thinking_W4A16(model):
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+
+    with VllmRunner(
+            model,
+            max_model_len=8192,
+            dtype="auto",
+            tensor_parallel_size=4,
+            enable_expert_parallel=True,
+            compilation_config={
+                "cudagraph_mode": "FULL_DECODE_ONLY",
+                "cudagraph_capture_sizes": [1],
+            },
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)