diff --git a/README.md b/README.md
index e2bec0b..778b707 100644
--- a/README.md
+++ b/README.md
@@ -149,6 +149,14 @@ By utilizing the vLLM Kunlun plugin, popular open-source models, including Trans
✅ |
|
+
+ | Kimi-K2 |
+ ✅ |
+ ✅ |
+ |
+ ✅ |
+ |
+
diff --git a/docs/source/user_guide/feature_guide/quantization.md b/docs/source/user_guide/feature_guide/quantization.md
index 9d90df7..a7094b6 100644
--- a/docs/source/user_guide/feature_guide/quantization.md
+++ b/docs/source/user_guide/feature_guide/quantization.md
@@ -8,22 +8,23 @@ Like vLLM, we now support quantization methods such as compressed-tensors, AWQ,
- | Compressed-Tensor (w8a8) |
+ Compressed-Tensors (w8a8-Int8) |
Weight only (w4a16/w8a16) |
| Dynamic |
Static |
- AWQ (w4a16) |
+ AWQ (w4a16) |
GPTQ (w4a16/w8a16) |
+ Compressed-Tensors (w4a16) |
+ | Dense/MoE |
Dense/MoE |
Dense/MoE |
Dense |
MoE |
- Dense |
- MoE |
+ Dense/MoE |
@@ -32,14 +33,16 @@ Like vLLM, we now support quantization methods such as compressed-tensors, AWQ,
✅ |
✅ |
✅ |
- ✅ |
WIP |
+ ✅ |
-+ W8A8 dynamic and static quantization are now supported for all LLMs and VLMs.
-+ AWQ/GPTQ quantization is supported for all dense models.
++ Compressed-Tensors w8a8-Int8 dynamic and static quantization are supported for all LLMs and VLMs.
++ Compressed-Tensors w4a16 is supported for all LLMs and VLMs.
++ AWQ (w4a16) quantization is supported for all LLMs and VLMs.
++ GPTQ (w4a16/w8a16) quantization is supported for all dense models.
## Usages
diff --git a/vllm_kunlun/ops/__init__.py b/vllm_kunlun/ops/__init__.py
index c5d2991..1776a76 100644
--- a/vllm_kunlun/ops/__init__.py
+++ b/vllm_kunlun/ops/__init__.py
@@ -15,13 +15,20 @@
# This file is a part of the vllm-ascend project.
#
+# embedding
import vllm_kunlun.ops.rotary_embedding
-import vllm_kunlun.ops.layernorm
+import vllm_kunlun.ops.vocab_parallel_embedding
+
+# quantization
import vllm_kunlun.ops.quantization.awq
import vllm_kunlun.ops.quantization.gptq
import vllm_kunlun.ops.quantization.moe_wna16
-import vllm_kunlun.ops.vocab_parallel_embedding
+import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors
+import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
+import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
+import vllm_kunlun.ops.quantization.kernels.kunlun_exllama_linear
+
+# base layers
+import vllm_kunlun.ops.layernorm
import vllm_kunlun.ops.linear
import vllm_kunlun.ops.fused_moe.layer
-import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
-import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
\ No newline at end of file
diff --git a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py
new file mode 100644
index 0000000..27d528f
--- /dev/null
+++ b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py
@@ -0,0 +1,86 @@
+#
+# Copyright (c) 2026 Baidu, Inc. All Rights Reserved.
+# Author: Li Wei, Tang Shiwen
+# Email: liwei157@baidu.com, tangshiwen@baidu.com
+# This file is a part of the vllm-kunlun project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+import torch
+from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import (
+ LinearBase,
+ LinearMethodBase,
+ UnquantizedLinearMethod,
+)
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
+ CompressedTensorsConfig,
+ CompressedTensorsLinearMethod,
+ CompressedTensorsKVCacheMethod,
+ CompressedTensorsLinearTransformMethod,
+ get_linear_transform_schemes,
+)
+from vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe import (
+ KunlunCompressedTensorsMoEMethod,
+)
+
+
+class KunlunCompressedTensorsConfig(CompressedTensorsConfig):
+ def get_quant_method(
+ self,
+ layer: torch.nn.Module,
+ prefix: str,
+ ) -> Optional["QuantizeMethodBase"]:
+ from vllm.attention.layer import Attention # Avoid circular import
+
+ if isinstance(layer, LinearBase):
+ # collect schemes
+ quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
+ input_tfms, output_tfms = get_linear_transform_schemes(
+ layer, prefix, self.transform_config, self.packed_modules_mapping
+ )
+
+ # choose quantization method
+ quant_method: LinearMethodBase = UnquantizedLinearMethod()
+ if quant_scheme is not None:
+ layer.scheme = quant_scheme
+ quant_method = CompressedTensorsLinearMethod(self)
+
+ # choose transform method
+ if any((input_tfms, output_tfms)):
+ return CompressedTensorsLinearTransformMethod.from_schemes(
+ quant_method, quant_scheme, input_tfms, output_tfms
+ )
+
+ else:
+ return quant_method
+
+ if isinstance(layer, Attention):
+ return CompressedTensorsKVCacheMethod(self)
+ if isinstance(layer, FusedMoE):
+ return KunlunCompressedTensorsMoEMethod.get_moe_method(self, layer)
+ return None
+
+
+# monkey patch
+from vllm.model_executor.layers.quantization.compressed_tensors import (
+ compressed_tensors,
+)
+
+compressed_tensors.CompressedTensorsConfig = KunlunCompressedTensorsConfig
+print(
+ "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors.CompressedTensorsConfig \
+ --> vllm_kunlun.ops.quantization.compressed_tensors.KunlunCompressedTensorsConfig"
+)
diff --git a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors_moe.py
index c46fa3b..d9d810d 100644
--- a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -19,9 +19,95 @@
from typing import Callable, Optional, Union
import torch
+from vllm.logger import init_logger
+from compressed_tensors.quantization import ActivationOrdering, QuantizationStrategy
+from vllm.model_executor.layers.fused_moe import FusedMoEConfig, FusedMoEMethodBase
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+ CompressedTensorsW4A4MoeMethod,
+ CompressedTensorsW4A8Int8MoEMethod,
CompressedTensorsW8A8Int8MoEMethod,
+ # NOTE(review): duplicate CompressedTensorsW8A8Int8MoEMethod import removed
+ CompressedTensorsW8A8Fp8MoEMethod,
+ CompressedTensorsWNA16MoEMethod,
+ find_matched_target,
)
+from vllm_kunlun.ops._kunlun_ops import KunlunOps as ops
+from vllm_kunlun.ops.quantization.kernels.quant_ops import dequant_int4_native
+
+logger = init_logger(__name__)
+
+
+class KunlunCompressedTensorsMoEMethod(FusedMoEMethodBase):
+
+ def __init__(self, moe: FusedMoEConfig):
+ super().__init__(moe)
+
+ @staticmethod
+ def get_moe_method(
+ quant_config: "CompressedTensorsConfig", # type: ignore # noqa: E501
+ layer: torch.nn.Module,
+ ) -> "KunlunCompressedTensorsMoEMethod":
+ # TODO: @dsikka: refactor this to use schemes as other kernels
+ # are supported + check if the layer is being ignored.
+ # Check if a using "Linear" to select schemes
+ if "Linear" in quant_config.target_scheme_map:
+ matched_target = "Linear"
+ else:
+ # May have instead defined the linear layers in the fused model
+ fused_layers = ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
+ current_scheme = None
+ for fused_layer in fused_layers:
+ # Check if one of the fused layers are defined in quant_config
+ matched_target = find_matched_target(
+ layer_name=fused_layer,
+ module=layer,
+ targets=quant_config.target_scheme_map.keys(),
+ fused_mapping=quant_config.packed_modules_mapping,
+ )
+
+ # Only valid if down_proj, gate_proj, and up_proj
+ # are mapped to the same quant scheme in the quant_config
+ if current_scheme is None:
+ current_scheme = quant_config.target_scheme_map.get(matched_target)
+ else:
+ assert current_scheme == quant_config.target_scheme_map.get(
+ matched_target
+ )
+
+ weight_quant = quant_config.target_scheme_map[matched_target].get("weights")
+ input_quant = quant_config.target_scheme_map[matched_target].get(
+ "input_activations"
+ )
+ if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
+ if (
+ weight_quant.strategy == QuantizationStrategy.GROUP
+ and weight_quant.actorder
+ in (ActivationOrdering.GROUP, ActivationOrdering.DYNAMIC)
+ ):
+ raise ValueError(
+ "WNA16MoE is not supported with actorder=group/dynamic."
+ )
+ # MarlinMoE kernel is not supported on XPU.
+ logger.warning_once("Using KunlunCompressedTensorsWNA16MoEMethod")
+ return KunlunCompressedTensorsWNA16MoEMethod(quant_config, layer.moe_config)
+ elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
+ return CompressedTensorsW4A4MoeMethod(layer.moe_config)
+ elif (
+ quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
+ or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
+ or quant_config._is_fp8_w8a8(weight_quant, input_quant)
+ ):
+ return CompressedTensorsW8A8Fp8MoEMethod(quant_config, layer.moe_config)
+ elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant):
+ return KunlunCompressedTensorsW8A8Int8MoEMethod(
+ quant_config, layer.moe_config
+ )
+ elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant):
+ return CompressedTensorsW4A8Int8MoEMethod(quant_config, layer.moe_config)
+ else:
+ raise RuntimeError(
+ f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
+ )
class KunlunCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsW8A8Int8MoEMethod):
@@ -184,7 +270,7 @@ class KunlunCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsW8A8Int8MoEMetho
# sort_mode=False,
act=None,
)
- del x_q, x_scale, sorted_tokens_num_lod,expert_m
+ del x_q, x_scale, sorted_tokens_num_lod, expert_m
dequant_scale = torch.ones([M, top_k], dtype=torch.float32, device=out.device)
output = torch.empty(
@@ -202,6 +288,75 @@ class KunlunCompressedTensorsW8A8Int8MoEMethod(CompressedTensorsW8A8Int8MoEMetho
return output
+class KunlunCompressedTensorsWNA16MoEMethod(CompressedTensorsWNA16MoEMethod):
+
+ def apply(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ router_logits: torch.Tensor,
+ top_k: int,
+ renormalize: bool,
+ use_grouped_topk: bool = False,
+ topk_group: Optional[int] = None,
+ num_expert_group: Optional[int] = None,
+ global_num_experts: int = -1,
+ expert_map: Optional[torch.Tensor] = None,
+ custom_routing_function: Optional[Callable] = None,
+ scoring_func: str = "softmax",
+ routed_scaling_factor: float = 1.0,
+ e_score_correction_bias: Optional[torch.Tensor] = None,
+ apply_router_weight_on_input: bool = False,
+ activation: str = "silu",
+ enable_eplb: bool = False,
+ expert_load_view: Optional[torch.Tensor] = None,
+ logical_to_physical_map: Optional[torch.Tensor] = None,
+ logical_replica_count: Optional[torch.Tensor] = None,
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+ # dequant packed weights to float16
+ w13_weight = dequant_int4_native(
+ weight_packed_uint8=layer.w13_weight_packed,
+ scale=self.moe_quant_config.w1_scale,
+ )
+ w2_weight = dequant_int4_native(
+ weight_packed_uint8=layer.w2_weight_packed,
+ scale=self.moe_quant_config.w2_scale,
+ )
+
+ if self.moe.use_ep:
+ return ops.fused_moe_ep(
+ x,
+ w13_weight,
+ w2_weight,
+ router_logits,
+ self.moe.ep_rank,
+ top_k,
+ renormalize=renormalize,
+ inplace=True,
+ use_grouped_topk=use_grouped_topk,
+ num_expert_group=num_expert_group,
+ topk_group=topk_group,
+ )
+ else:
+ return ops.fused_moe(
+ x,
+ w13_weight,
+ w2_weight,
+ router_logits,
+ self.moe.ep_rank,
+ top_k,
+ renormalize=renormalize,
+ inplace=True,
+ use_grouped_topk=use_grouped_topk,
+ num_expert_group=num_expert_group,
+ topk_group=topk_group,
+ scoring_func=scoring_func,
+ e_score_correction_bias=e_score_correction_bias,
+ w1_bias=getattr(layer, "w13_bias", None),
+ w2_bias=getattr(layer, "w2_bias", None),
+ )
+
+
# monkey patch
from vllm.model_executor.layers.quantization.compressed_tensors import (
compressed_tensors_moe,
@@ -210,7 +365,21 @@ from vllm.model_executor.layers.quantization.compressed_tensors import (
compressed_tensors_moe.CompressedTensorsW8A8Int8MoEMethod = (
KunlunCompressedTensorsW8A8Int8MoEMethod
)
-print(
- "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Int8MoEMethod \
- --> vllm_kunlun.ops.quantization.compressed_tensors_moe.py:KunlunCompressedTensorsW8A8Int8MoEMethod"
+compressed_tensors_moe.CompressedTensorsMoEMethod = KunlunCompressedTensorsMoEMethod
+compressed_tensors_moe.CompressedTensorsWNA16MoEMethod = (
+ KunlunCompressedTensorsWNA16MoEMethod
+)
+KunlunCompressedTensorsWNA16MoEMethod.__name__ = "CompressedTensorsWNA16MoEMethod"
+
+logger.info_once(
+ "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Int8MoEMethod \
+ --> vllm_kunlun.ops.quantization.compressed_tensors_moe.KunlunCompressedTensorsW8A8Int8MoEMethod"
+)
+logger.info_once(
+ "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsMoEMethod \
+ --> vllm_kunlun.ops.quantization.compressed_tensors_moe.KunlunCompressedTensorsMoEMethod"
+)
+logger.info_once(
+ "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsWNA16MoEMethod \
+ --> vllm_kunlun.ops.quantization.compressed_tensors_moe.KunlunCompressedTensorsWNA16MoEMethod"
)
diff --git a/vllm_kunlun/ops/quantization/kernels/kunlun_exllama_linear.py b/vllm_kunlun/ops/quantization/kernels/kunlun_exllama_linear.py
new file mode 100644
index 0000000..2e68834
--- /dev/null
+++ b/vllm_kunlun/ops/quantization/kernels/kunlun_exllama_linear.py
@@ -0,0 +1,57 @@
+#
+# Copyright (c) 2026 Baidu, Inc. All Rights Reserved.
+# Author: Li Wei
+# Email: liwei157@baidu.com
+# This file is a part of the vllm-kunlun project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+import xspeedgate_ops
+from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
+ ExllamaLinearKernel,
+ _POSSIBLE_KERNELS,
+)
+
+
+class KunlunExllamaLinearKernel(ExllamaLinearKernel):
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ c = self.config
+
+ x_2d = x.reshape(-1, x.shape[-1])
+ out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
+
+ w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)
+
+ assert w_zp is not None, "Zero points are required by Exllama"
+ assert w_g_idx is not None, "Group index is required by Exllama"
+ output = torch.ops.xspeedgate_ops.gptq_gemm(
+ x_2d, w_q, w_zp, w_s, w_g_idx, True, c.weight_type.size_bits
+ )
+
+ if bias is not None:
+ output.add_(bias)
+ return output.reshape(out_shape)
+
+
+# remove ExllamaLinearKernel and add KunlunExllamaLinearKernel
+_POSSIBLE_KERNELS.remove(ExllamaLinearKernel)
+_POSSIBLE_KERNELS.append(KunlunExllamaLinearKernel)
diff --git a/vllm_kunlun/ops/quantization/kernels/kunlun_scale_mm.py b/vllm_kunlun/ops/quantization/kernels/kunlun_scale_mm.py
index d1c6b3a..c8f4e1c 100644
--- a/vllm_kunlun/ops/quantization/kernels/kunlun_scale_mm.py
+++ b/vllm_kunlun/ops/quantization/kernels/kunlun_scale_mm.py
@@ -99,12 +99,5 @@ class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel):
# )
-# monkey patch
-_POSSIBLE_KERNELS[PlatformEnum.CUDA] = [KunlunScaledMMLinearKernel]
-from vllm.model_executor.layers.quantization.kernels.scaled_mm import cutlass
-
-cutlass.CutlassScaledMMLinearKernel = KunlunScaledMMLinearKernel
-print(
- "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass.CutlassScaledMMLinearKernel \
- --> vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm.KunlunScaledMMLinearKernel"
-)
+# replace CutlassScaledMMLinearKernel with KunlunScaledMMLinearKernel
+_POSSIBLE_KERNELS[PlatformEnum.CUDA] = [KunlunScaledMMLinearKernel]
\ No newline at end of file
diff --git a/vllm_kunlun/ops/quantization/kernels/quant_ops.py b/vllm_kunlun/ops/quantization/kernels/quant_ops.py
index 00e1312..bdc47ec 100644
--- a/vllm_kunlun/ops/quantization/kernels/quant_ops.py
+++ b/vllm_kunlun/ops/quantization/kernels/quant_ops.py
@@ -1,7 +1,7 @@
#
-# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
-# Author: Tang Shiwen
-# Email: tangshiwen@baidu.com
+# Copyright (c) 2026 Baidu, Inc. All Rights Reserved.
+# Author: Tang Shiwen, Li Wei
+# Email: tangshiwen@baidu.com, liwei157@baidu.com
# This file is a part of the vllm-kunlun project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -66,3 +66,21 @@ def dequant_int4(
)
return fpweight.transpose(1, 2).contiguous()
+
+
+def dequant_int4_native(weight_packed_uint8: torch.Tensor, scale: torch.Tensor):
+    """Unpack uint4 weight from packed uint8 weight and dequant it to float16."""
+    weight_unpacked_fp16 = (
+        torch.stack(
+            (weight_packed_uint8 & 0xF, (weight_packed_uint8 >> 4) & 0xF),
+            dim=-1,
+        )
+        .reshape(*weight_packed_uint8.shape[:-1], -1)
+        .contiguous()
+        .to(torch.float16)
+        - 8.0
+    )
+    weight_unpacked_fp16 *= scale.repeat(
+        1, 1, weight_unpacked_fp16.shape[-1] // scale.shape[-1]
+    )
+    return weight_unpacked_fp16
diff --git a/vllm_kunlun/vllm_utils_wrapper.py b/vllm_kunlun/vllm_utils_wrapper.py
index 80ff691..b4d15a4 100644
--- a/vllm_kunlun/vllm_utils_wrapper.py
+++ b/vllm_kunlun/vllm_utils_wrapper.py
@@ -2275,7 +2275,7 @@ fwd_kvcache_mla.register_fake(_fake_fwd_kvcache_mla)
##################################################
-# --------------- dequant_int4 -----------------
+# --------------- dequant_int4 -------------------
##################################################
@custom_op("_C::dequant_int4", mutates_args=())
def dequant_int4(