[Feat] Support Kimi-K2-Thinking native W4A16 quantized expert weights (#4516)

### What this PR does / why we need it?

Adds the W4A16 quantization method for the Kimi-K2-Thinking model and
updates the relevant modules to support it.

- Implements the complete W4A16 quantization method, including weight
packing/unpacking, per-group quantization parameter generation,
post-processing logic, and the MoE method application (a minimal
illustrative sketch follows this list).
- Adds the `use_int4_w4a16`, `w1_offset`, and `w2_offset` parameters and
adjusts the `with_quant` condition to support W4A16 matrix multiplication
(a reference dequantize-then-matmul sketch appears after the diff).
- Adds a `packed_modules_model_mapping` entry for the Kimi-K2-Thinking model
and handling logic for the `weight_packed` field.
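
For readers new to W4A16, the sketch below illustrates what the "per-group quantization parameter generation" and "weight packing" steps involve in general: asymmetric per-group int4 quantization of a weight matrix, with two 4-bit codes packed per byte. The helper names, the group size of 128, and the packing layout are illustrative assumptions, not the actual functions added by this PR.

```python
import torch

def quantize_w4a16_per_group(w: torch.Tensor, group_size: int = 128):
    """Illustrative asymmetric per-group int4 quantization of an [n, k] weight.

    Returns int4 codes (0..15, stored as uint8) plus per-group scale and
    offset of shape [n, k // group_size]. Names and layout are examples only.
    """
    n, k = w.shape
    grouped = w.reshape(n, k // group_size, group_size)
    w_min = grouped.amin(dim=-1, keepdim=True)
    w_max = grouped.amax(dim=-1, keepdim=True)
    scale = ((w_max - w_min) / 15.0).clamp(min=1e-8)  # 4-bit range is 0..15
    offset = torch.round(-w_min / scale)              # per-group zero point
    q = torch.clamp(torch.round(grouped / scale) + offset, 0, 15)
    return q.reshape(n, k).to(torch.uint8), scale.squeeze(-1), offset.squeeze(-1)

def pack_int4(q: torch.Tensor) -> torch.Tensor:
    """Pack two int4 codes into one byte along the last dimension (k halves)."""
    lo = q[..., 0::2].to(torch.int32) & 0xF
    hi = q[..., 1::2].to(torch.int32) & 0xF
    return (lo | (hi << 4)).to(torch.uint8)
```

Unpacking reverses the byte split, and dequantization is `(q - offset) * scale` per group; the `antiquant_scale` / `antiquant_offset` arguments in the diff below presumably carry this kind of per-group metadata, though the kernel's exact convention is not spelled out here.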

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
Signed-off-by: Ruri <33858552+zhoux77899@users.noreply.github.com>
Signed-off-by: Ruri <zhouxiang100@huawei.com>
Authored by Ruri on 2025-12-10 15:58:52 +08:00, committed by GitHub.
Parent: c1db298f43
Commit: ce5872705e
13 changed files with 781 additions and 13 deletions


@@ -93,12 +93,15 @@ class MoECommMethod(ABC):
         apply_router_weight_on_input: bool = False,
         use_int8_w8a8: bool = False,
         use_int4_w4a8: bool = False,
+        use_int4_w4a16: bool = False,
         global_num_experts: Optional[int] = None,
         expert_map: Optional[torch.Tensor] = None,
         w1_scale: Optional[list[torch.Tensor]] = None,
         w2_scale: Optional[list[torch.Tensor]] = None,
         w1_scale_bias: torch.Tensor = None,
         w2_scale_bias: torch.Tensor = None,
+        w1_offset: Optional[torch.Tensor] = None,
+        w2_offset: Optional[torch.Tensor] = None,
         # For Cube/Vector parallel
         shared_experts: Optional[Any] = None,
         quantized_x_for_share: Optional[Any] = None,
@@ -147,9 +150,11 @@ class MoECommMethod(ABC):
             group_list_type=group_list_type,
             w1_scale_bias=w1_scale_bias,
             w2_scale_bias=w2_scale_bias,
+            w1_offset=w1_offset,
+            w2_offset=w2_offset,
             topk_scales=topk_scales,
             with_quant=use_int8_w8a8
-            or use_int4_w4a8,
+            or use_int4_w4a8 or use_int4_w4a16,
             fusion=use_int8_w8a8,
             need_trans=need_trans,
             dynamic_eplb=dynamic_eplb)
@@ -275,12 +280,15 @@ class FusedAlltoAllCommImpl(MoECommMethod):
         apply_router_weight_on_input: bool = False,
         use_int8_w8a8: bool = False,
         use_int4_w4a8: bool = False,
+        use_int4_w4a16: bool = False,
         global_num_experts: Optional[int] = None,
         expert_map: Optional[torch.Tensor] = None,
         w1_scale: Optional[torch.Tensor] = None,
         w2_scale: Optional[torch.Tensor] = None,
         w1_scale_bias: torch.Tensor = None,
         w2_scale_bias: torch.Tensor = None,
+        w1_offset: Optional[torch.Tensor] = None,
+        w2_offset: Optional[torch.Tensor] = None,
         # For Cube/Vector parallel
         shared_experts: Optional[Any] = None,
         quantized_x_for_share: Optional[Any] = None,


@@ -68,9 +68,14 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
                     dynamic_scale: torch.Tensor = None,
                     w1_scale_bias: torch.Tensor = None,
                     w2_scale_bias: torch.Tensor = None,
+                    w1_offset: Optional[torch.Tensor] = None,
+                    w2_offset: Optional[torch.Tensor] = None,
                     fusion: bool = False,
                     dynamic_eplb: bool = False) -> torch.Tensor:
-    if dynamic_scale is None:
+    if w1_offset is not None:
+        unquantized_hidden_states = hidden_states
+        quantized_hidden_states = None
+    elif dynamic_scale is None:
         unquantized_hidden_states = hidden_states
         hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
             hidden_states)
@@ -79,6 +84,7 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
         dispose_tensor(unquantized_hidden_states)
         quantized_hidden_states = None
     else:
+        unquantized_hidden_states = None
         pertoken_scale = dynamic_scale
         quantized_hidden_states = hidden_states
@@ -90,7 +96,7 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
         weight_prefetch_method.maybe_prefetch_moe_weight_postprocess(
             hidden_states)
     is_mc2 = get_forward_context().moe_comm_type == MoECommType.MC2
-    if w1_scale_bias is None and is_mc2:
+    if w1_scale_bias is None and w1_offset is None and is_mc2:
         if _custom_gmm_swiglu_enabled(fusion, dynamic_eplb):
             # gmm1: gate_up_proj & act_fn: swiglu
             hidden_states, swiglu_out_scale, _ = (
@@ -149,6 +155,32 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
             group_type=0,
             group_list=group_list,
             output_dtype=w2_scale[0].dtype)[0]
+    elif w1_offset is not None:
+        # gmm1: gate_up_proj
+        hidden_states = torch_npu.npu_grouped_matmul(
+            x=[unquantized_hidden_states],
+            weight=[w1],
+            antiquant_scale=[w1_scale],
+            antiquant_offset=[w1_offset],
+            split_item=2,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=_output_dtype)[0]
+        dispose_tensor(unquantized_hidden_states)
+        # act_fn: swiglu
+        hidden_states = torch_npu.npu_swiglu(hidden_states)
+        # gmm2: down_proj
+        hidden_states = torch_npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[w2],
+            antiquant_scale=[w2_scale],
+            antiquant_offset=[w2_offset],
+            split_item=2,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=_output_dtype)[0]
     else:
         if w1_scale_bias is not None:
             if group_list_type == 0:
@@ -269,6 +301,8 @@ def unified_apply_mlp(hidden_states: torch.Tensor,
                       group_list_type: int = 1,
                       w1_scale_bias: torch.Tensor = None,
                       w2_scale_bias: torch.Tensor = None,
+                      w1_offset: Optional[torch.Tensor] = None,
+                      w2_offset: Optional[torch.Tensor] = None,
                       topk_scales: Optional[torch.Tensor] = None,
                       with_quant: bool = False,
                       fusion: bool = False,
@@ -286,6 +320,8 @@ def unified_apply_mlp(hidden_states: torch.Tensor,
             group_list_type=group_list_type,
             w1_scale_bias=w1_scale_bias,
             w2_scale_bias=w2_scale_bias,
+            w1_offset=w1_offset,
+            w2_offset=w2_offset,
             fusion=fusion,
             dynamic_eplb=dynamic_eplb)
     else:
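
For context on the new `elif w1_offset is not None:` branch above: in W4A16 the activations stay in bf16/fp16 and only the expert weights are int4, so the hidden states are not passed through `npu_dynamic_quant`; instead `npu_grouped_matmul` receives `antiquant_scale` / `antiquant_offset` and dequantizes the weights on the fly. A minimal per-expert reference of that dequantize-then-matmul semantics is sketched below; the function name, the group size of 128, and the tensor layouts are assumptions for illustration, and the real NPU kernel fuses this computation (its exact offset sign convention may differ).

```python
import torch

def w4a16_matmul_reference(x: torch.Tensor, q: torch.Tensor,
                           scale: torch.Tensor, offset: torch.Tensor,
                           group_size: int = 128) -> torch.Tensor:
    """Reference semantics for a single expert (not the NPU kernel).

    x:      [m, k] bf16/fp16 activations, left unquantized in W4A16
    q:      [k, n] unpacked int4 codes stored as an integer tensor
    scale:  [k // group_size, n] per-group scales
    offset: [k // group_size, n] per-group zero points
    """
    scale_full = scale.repeat_interleave(group_size, dim=0)    # -> [k, n]
    offset_full = offset.repeat_interleave(group_size, dim=0)  # -> [k, n]
    # Affine dequantization, then a plain matmul in the activation dtype.
    w = ((q.to(torch.float32) - offset_full) * scale_full).to(x.dtype)
    return x @ w
```

In the grouped-matmul calls above this happens per expert over `group_list`, with gmm1 feeding `npu_swiglu` and gmm2 producing the down projection, mirroring the structure of the existing quantized MLP paths.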