[Feature] support compressed-tensors w4a16 quantization (#154)

- native int4 kimi model inference is supported Signed-off-by: Li Wei <liwei.109@outlook.com>
2026-01-27 19:56:22 +08:00
parent 0711c1abfa
commit 71bd70ad6c
9 changed files with 369 additions and 28 deletions
--- a/README.md
+++ b/README.md
@@ -149,6 +149,14 @@ By utilizing the vLLM Kunlun plugin, popular open-source models, including Trans
      <td class="status-support">✅</td>
      <td></td>
    </tr>
+    <tr>
+      <td class="model-name">Kimi-K2</td>
+      <td class="status-support">✅</td>
+      <td class="status-support">✅</td>
+      <td></td>
+      <td class="status-support">✅</td>
+      <td></td>
+    </tr>
  </tbody>
 </table>

--- a/docs/source/user_guide/feature_guide/quantization.md
+++ b/docs/source/user_guide/feature_guide/quantization.md
@@ -8,22 +8,23 @@ Like vLLM, we now support quantization methods such as compressed-tensors, AWQ,
 <table border="1" style="border-collapse: collapse; width: auto; margin: 0 0 0 0; text-align: center;">
  <thead>
    <tr>
-      <td colspan="2" style="padding: 10px; font-weight: bold; border: 1px solid #000;">Compressed-Tensor (w8a8)</td>
+      <td colspan="2" style="padding: 10px; font-weight: bold; border: 1px solid #000;">Compressed-Tensors (w8a8-Int8)</td>
      <td colspan="4" style="padding: 10px; font-weight: bold; border: 1px solid #000;">Weight only (w4a16/w8a16)</td>
    </tr>
    <tr>
      <td style="padding: 10px; border: 1px solid #000;">Dynamic</td>
      <td style="padding: 10px; border: 1px solid #000;">Static</td>
-      <td colspan="2" style="padding: 10px; border: 1px solid #000;">AWQ (w4a16)</td>
+      <td colspan="1" style="padding: 10px; border: 1px solid #000;">AWQ (w4a16)</td>
      <td colspan="2" style="padding: 10px; border: 1px solid #000;">GPTQ (w4a16/w8a16)</td>
+       <td colspan="1" style="padding: 10px; border: 1px solid #000;">Compressed-Tensors (w4a16)</td>
    </tr>
    <tr>
+      <td style="padding: 10px; border: 1px solid #000;">Dense/MoE</td>
      <td style="padding: 10px; border: 1px solid #000;">Dense/MoE</td>
      <td style="padding: 10px; border: 1px solid #000;">Dense/MoE</td>
      <td style="padding: 10px; border: 1px solid #000;">Dense</td>
      <td style="padding: 10px; border: 1px solid #000;">MoE</td>
-      <td style="padding: 10px; border: 1px solid #000;">Dense</td>
-      <td style="padding: 10px; border: 1px solid #000;">MoE</td>
+      <td style="padding: 10px; border: 1px solid #000;">Dense/MoE</td>
    </tr>
  </thead>
  <tbody>
@@ -32,14 +33,16 @@ Like vLLM, we now support quantization methods such as compressed-tensors, AWQ,
      <td style="padding: 10px; border: 1px solid #000;">✅</td>
      <td style="padding: 10px; border: 1px solid #000;">✅</td>
      <td style="padding: 10px; border: 1px solid #000;">✅</td>
-      <td style="padding: 10px; border: 1px solid #000;">✅</td>
      <td style="padding: 10px; border: 1px solid #000;">WIP</td>
+      <td style="padding: 10px; border: 1px solid #000;">✅</td>
    </tr>
  </tbody>
 </table>

-+ W8A8 dynamic and static quantization are now supported for all LLMs and VLMs.
-+ AWQ/GPTQ quantization is supported for all dense models.
+ Compressed-Tensors w8a8-Int8 dynamic and static quantization are supported for all LLMs and VLMs.
+ Compressed-Tensors w4a16 are supported for all LLMs and VLMs.
+ AWQ(w4a16) quantization is supported for all LLMs and VLMs.
+ GPTQ (w4a16/w8a16) quantization is supported for all dense models.

 ## Usages

--- a/vllm_kunlun/ops/init.py
+++ b/vllm_kunlun/ops/init.py
@@ -15,13 +15,20 @@
 # This file is a part of the vllm-ascend project.
 #

+# embedding
 import vllm_kunlun.ops.rotary_embedding
-import vllm_kunlun.ops.layernorm
+import vllm_kunlun.ops.vocab_parallel_embedding
+
+# quantization
 import vllm_kunlun.ops.quantization.awq
 import vllm_kunlun.ops.quantization.gptq
 import vllm_kunlun.ops.quantization.moe_wna16
-import vllm_kunlun.ops.vocab_parallel_embedding
+import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors
+import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
+import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
+import vllm_kunlun.ops.quantization.kernels.kunlun_exllama_linear
+
+# base layers
+import vllm_kunlun.ops.layernorm
 import vllm_kunlun.ops.linear
 import vllm_kunlun.ops.fused_moe.layer
-import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
-import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
--- a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py
@@ -0,0 +1,86 @@
+#
+# Copyright (c) 2026 Baidu, Inc. All Rights Reserved.
+# Author: Li Wei, Tang Shiwen
+# Email: liwei157@baidu.com, tangshiwen@baidu.com
+# This file is a part of the vllm-kunlun project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+import torch
+from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
+    CompressedTensorsConfig,
+    CompressedTensorsLinearMethod,
+    CompressedTensorsKVCacheMethod,
+    CompressedTensorsLinearTransformMethod,
+    get_linear_transform_schemes,
+)
+from vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe import (
+    KunlunCompressedTensorsMoEMethod,
+)
+
+
+class KunlunCompressedTensorsConfig(CompressedTensorsConfig):
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional["QuantizeMethodBase"]:
+        from vllm.attention.layer import Attention  # Avoid circular import
+
+        if isinstance(layer, LinearBase):
+            # collect schemes
+            quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
+            input_tfms, output_tfms = get_linear_transform_schemes(
+                layer, prefix, self.transform_config, self.packed_modules_mapping
+            )
+
+            # choose quantization method
+            quant_method: LinearMethodBase = UnquantizedLinearMethod()
+            if quant_scheme is not None:
+                layer.scheme = quant_scheme
+                quant_method = CompressedTensorsLinearMethod(self)
+
+            # choose transform method
+            if any((input_tfms, output_tfms)):
+                return CompressedTensorsLinearTransformMethod.from_schemes(
+                    quant_method, quant_scheme, input_tfms, output_tfms
+                )
+
+            else:
+                return quant_method
+
+        if isinstance(layer, Attention):
+            return CompressedTensorsKVCacheMethod(self)
+        if isinstance(layer, FusedMoE):
+            return KunlunCompressedTensorsMoEMethod.get_moe_method(self, layer)
+        return None
+
+
+# monkey patch
+from vllm.model_executor.layers.quantization.compressed_tensors import (
+    compressed_tensors,
+)
+
+compressed_tensors.CompressedTensorsConfig = KunlunCompressedTensorsConfig
+print(
+    "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors.CompressedTensorsConfig \
+      --> vllm_kunlun.ops.quantization.compressed_tensors.KunlunCompressedTensorsConfig"
+)
--- a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -19,9 +19,95 @@
 from typing import Callable, Optional, Union

 import torch
+from vllm.logger import init_logger
+from compressed_tensors.quantization import ActivationOrdering, QuantizationStrategy
+from vllm.model_executor.layers.fused_moe import FusedMoEConfig, FusedMoEMethodBase
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+    CompressedTensorsW4A4MoeMethod,
+    CompressedTensorsW4A8Int8MoEMethod,
    CompressedTensorsW8A8Int8MoEMethod,
+    CompressedTensorsW8A8Int8MoEMethod,
+    CompressedTensorsW8A8Fp8MoEMethod,
+    CompressedTensorsWNA16MoEMethod,
+    find_matched_target,
 )
+from vllm_kunlun.ops._kunlun_ops import KunlunOps as ops
+from vllm_kunlun.ops.quantization.kernels.quant_ops import dequant_int4_native
+
+logger = init_logger(__name__)
+
+
+class KunlunCompressedTensorsMoEMethod(FusedMoEMethodBase):
+
+    def __init_(self, moe: FusedMoEConfig):
+        super().__init__(moe)
+
+    @staticmethod
+    def get_moe_method(
+        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
+        layer: torch.nn.Module,
+    ) -> "KunlunCompressedTensorsMoEMethod":
+        # TODO: @dsikka: refactor this to use schemes as other kernels
+        # are supported + check if the layer is being ignored.
+        # Check if a using "Linear" to select schemes
+        if "Linear" in quant_config.target_scheme_map:
+            matched_target = "Linear"
+        else:
+            # May have instead defined the linear layers in the fused model
+            fused_layers = ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
+            current_scheme = None
+            for fused_layer in fused_layers:
+                # Check if one of the fused layers are defined in quant_config
+                matched_target = find_matched_target(
+                    layer_name=fused_layer,
+                    module=layer,
+                    targets=quant_config.target_scheme_map.keys(),
+                    fused_mapping=quant_config.packed_modules_mapping,
+                )
+
+                # Only valid if down_proj, gate_proj, and up_proj
+                # are mapped to the same quant scheme in the quant_config
+                if current_scheme is None:
+                    current_scheme = quant_config.target_scheme_map.get(matched_target)
+                else:
+                    assert current_scheme == quant_config.target_scheme_map.get(
+                        matched_target
+                    )
+
+        weight_quant = quant_config.target_scheme_map[matched_target].get("weights")
+        input_quant = quant_config.target_scheme_map[matched_target].get(
+            "input_activations"
+        )
+        if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
+            if (
+                weight_quant.strategy in QuantizationStrategy.GROUP
+                and weight_quant.actorder
+                in (ActivationOrdering.GROUP, ActivationOrdering.DYNAMIC)
+            ):
+                raise ValueError(
+                    "WNA16MoE is not supported with actorder=group/dynamic."
+                )
+            # MarlinMoE kernel is not supported on XPU.
+            logger.warning_once(f"Using KunlunCompressedTensorsWNA16MoEMethod")
+            return KunlunCompressedTensorsWNA16MoEMethod(quant_config, layer.moe_config)
+        elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
+            return CompressedTensorsW4A4MoeMethod(layer.moe_config)
+        elif (
+            quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
+            or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
+            or quant_config._is_fp8_w8a8(weight_quant, input_quant)
+        ):
+            return CompressedTensorsW8A8Fp8MoEMethod(quant_config, layer.moe_config)
+        elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant):
+            return KunlunCompressedTensorsW8A8Int8MoEMethod(
+                quant_config, layer.moe_config
+            )
+        elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant):
+            return CompressedTensorsW4A8Int8MoEMethod(quant_config, layer.moe_config)
+        else:
+            raise RuntimeError(
+                f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
+            )


 class KunlunCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsW8A8Int8MoEMethod):
@@ -184,7 +270,7 @@ class KunlunCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsW8A8Int8MoEMetho
            # sort_mode=False,
            act=None,
        )
-        del x_q, x_scale, sorted_tokens_num_lod,expert_m
+        del x_q, x_scale, sorted_tokens_num_lod, expert_m

        dequant_scale = torch.ones([M, top_k], dtype=torch.float32, device=out.device)
        output = torch.empty(
@@ -202,6 +288,75 @@ class KunlunCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsW8A8Int8MoEMetho
        return output


+class KunlunCompressedTensorsWNA16MoEMethod(CompressedTensorsWNA16MoEMethod):
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        # dequant packed weights to float16
+        w13_weight = dequant_int4_native(
+            weight_packed_uint8=layer.w13_weight_packed,
+            scale=self.moe_quant_config.w1_scale,
+        )
+        w2_weight = dequant_int4_native(
+            weight_packed_uint8=layer.w2_weight_packed,
+            scale=self.moe_quant_config.w2_scale,
+        )
+        
+        if self.moe.use_ep:
+            return ops.fused_moe_ep(
+                x,
+                w13_weight,
+                w2_weight,
+                router_logits,
+                self.moe.ep_rank,
+                top_k,
+                renormalize=renormalize,
+                inplace=True,
+                use_grouped_topk=use_grouped_topk,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+            )
+        else:
+            return ops.fused_moe(
+                x,
+                w13_weight,
+                w2_weight,
+                router_logits,
+                self.moe.ep_rank,
+                top_k,
+                renormalize=renormalize,
+                inplace=True,
+                use_grouped_topk=use_grouped_topk,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+                scoring_func=scoring_func,
+                e_score_correction_bias=e_score_correction_bias,
+                w1_bias=getattr(layer, "w13_bias", None),
+                w2_bias=getattr(layer, "w2_bias", None),
+            )
+
+
 # monkey patch
 from vllm.model_executor.layers.quantization.compressed_tensors import (
    compressed_tensors_moe,
@@ -210,7 +365,21 @@ from vllm.model_executor.layers.quantization.compressed_tensors import (
 compressed_tensors_moe.CompressedTensorsW8A8Int8MoEMethod = (
    KunlunCompressedTensorsW8A8Int8MoEMethod
 )
-print(
-    "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Int8MoEMethod \
-      --> vllm_kunlun.ops.quantization.compressed_tensors_moe.py:KunlunCompressedTensorsW8A8Int8MoEMethod"
+compressed_tensors_moe.CompressedTensorsMoEMethod = KunlunCompressedTensorsMoEMethod
+compressed_tensors_moe.CompressedTensorsWNA16MoEMethod = (
+    KunlunCompressedTensorsWNA16MoEMethod
+)
+KunlunCompressedTensorsWNA16MoEMethod.__name__ = "CompressedTensorsWNA16MoEMethod"
+
+logger.info_once(
+    "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Int8MoEMethod \
+      --> vllm_kunlun.ops.quantization.compressed_tensors_moe.KunlunCompressedTensorsW8A8Int8MoEMethod"
+)
+logger.info_once(
+    "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsMoEMethod \
+      --> vllm_kunlun.ops.quantization.compressed_tensors_moe.KunlunCompressedTensorsMoEMethod"
+)
+logger.info_once(
+    "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsWNA16MoEMethod \
+      --> vllm_kunlun.ops.quantization.compressed_tensors_moe.KunlunCompressedTensorsWNA16MoEMethod"
 )
--- a/vllm_kunlun/ops/quantization/kernels/kunlun_exllama_linear.py
+++ b/vllm_kunlun/ops/quantization/kernels/kunlun_exllama_linear.py
@@ -0,0 +1,57 @@
+#
+# Copyright (c) 2026 Baidu, Inc. All Rights Reserved.
+# Author: Li Wei
+# Email: liwei157@baidu.com
+# This file is a part of the vllm-kunlun project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+import xspeedgate_ops
+from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
+    ExllamaLinearKernel,
+    _POSSIBLE_KERNELS,
+)
+
+
+class KunlunExllamaLinearKernel(ExllamaLinearKernel):
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        c = self.config
+
+        x_2d = x.reshape(-1, x.shape[-1])
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
+
+        w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)
+
+        assert w_zp is not None, "Zero points are required by Exllama"
+        assert w_g_idx is not None, "Group index is required by Exllama"
+        output = torch.ops.xspeedgate_ops.gptq_gemm(
+            x_2d, w_q, w_zp, w_s, w_g_idx, True, c.weight_type.size_bits
+        )
+
+        if bias is not None:
+            output.add_(bias)
+        return output.reshape(out_shape)
+
+
+# remove ExllamaLinearKernel and add KunlunExllamaLinearKernel
+_POSSIBLE_KERNELS.remove(ExllamaLinearKernel)
+_POSSIBLE_KERNELS.append(KunlunExllamaLinearKernel)
--- a/vllm_kunlun/ops/quantization/kernels/kunlun_scale_mm.py
+++ b/vllm_kunlun/ops/quantization/kernels/kunlun_scale_mm.py
@@ -99,12 +99,5 @@ class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel):
            # )


-# monkey patch
-_POSSIBLE_KERNELS[PlatformEnum.CUDA] = [KunlunScaledMMLinearKernel]
-from vllm.model_executor.layers.quantization.kernels.scaled_mm import cutlass
-
-cutlass.CutlassScaledMMLinearKernel = KunlunScaledMMLinearKernel
-print(
-    "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass.CutlassScaledMMLinearKernel \
-      --> vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm.KunlunScaledMMLinearKernel"
-)
+# replace CutlassScaledMMLinearKernel with KunlunScaledMMLinearKernel
+_POSSIBLE_KERNELS[PlatformEnum.CUDA] = [KunlunScaledMMLinearKernel]
--- a/vllm_kunlun/ops/quantization/kernels/quant_ops.py
+++ b/vllm_kunlun/ops/quantization/kernels/quant_ops.py
@@ -1,7 +1,7 @@
 #
-# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
-# Author: Tang Shiwen
-# Email: tangshiwen@baidu.com
+# Copyright (c) 2026 Baidu, Inc. All Rights Reserved.
+# Author: Tang Shiwen, Li Wei
+# Email: tangshiwen@baidu.com, liwei157@baidu.com
 # This file is a part of the vllm-kunlun project.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -66,3 +66,21 @@ def dequant_int4(
    )

    return fpweight.transpose(1, 2).contiguous()
+
+
+def dequant_int4_native(weight_packed_uint8: torch.Tensor, scale: torch.Tensor):
+    """Unpack uint4 weight from packed uint8 weight and dequant it to float16."""
+    weight_upacked_fp16 = (
+        torch.stack(
+            (weight_packed_uint8 & 0xF, (weight_packed_uint8 >> 4) & 0xF),
+            dim=-1,
+        )
+        .reshape(*weight_packed_uint8.shape[:-1], -1)
+        .contiguous()
+        .to(torch.float16)
+        - 8.0
+    )
+    weight_upacked_fp16 *= scale.repeat(
+        1, 1, weight_upacked_fp16.shape[-1] // scale.shape[-1]
+    )
+    return weight_upacked_fp16
--- a/vllm_kunlun/vllm_utils_wrapper.py
+++ b/vllm_kunlun/vllm_utils_wrapper.py
@@ -2275,7 +2275,7 @@ fwd_kvcache_mla.register_fake(_fake_fwd_kvcache_mla)


 ##################################################
-# --------------- dequant_int4 -----------------
+# --------------- dequant_int4 -------------------
 ##################################################
@custom_op("_C::dequant_int4", mutates_args=())
 def dequant_int4(