From 71bd70ad6c82519951b95b2bb898ee165191d7bf Mon Sep 17 00:00:00 2001 From: Li Wei <52344829+liwei109@users.noreply.github.com> Date: Tue, 27 Jan 2026 19:56:22 +0800 Subject: [PATCH] [Feature] support compressed-tensors w4a16 quantization (#154) - native int4 kimi model inference is supported Signed-off-by: Li Wei --- README.md | 8 + .../user_guide/feature_guide/quantization.md | 17 +- vllm_kunlun/ops/__init__.py | 15 +- .../compressed_tensors/compressed_tensors.py | 86 +++++++++ .../compressed_tensors_moe.py | 177 +++++++++++++++++- .../kernels/kunlun_exllama_linear.py | 57 ++++++ .../quantization/kernels/kunlun_scale_mm.py | 11 +- .../ops/quantization/kernels/quant_ops.py | 24 ++- vllm_kunlun/vllm_utils_wrapper.py | 2 +- 9 files changed, 369 insertions(+), 28 deletions(-) create mode 100644 vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py create mode 100644 vllm_kunlun/ops/quantization/kernels/kunlun_exllama_linear.py diff --git a/README.md b/README.md index e2bec0b..778b707 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,14 @@ By utilizing the vLLM Kunlun plugin, popular open-source models, including Trans ✅ + + Kimi-K2 + ✅ + ✅ + + ✅ + + diff --git a/docs/source/user_guide/feature_guide/quantization.md b/docs/source/user_guide/feature_guide/quantization.md index 9d90df7..a7094b6 100644 --- a/docs/source/user_guide/feature_guide/quantization.md +++ b/docs/source/user_guide/feature_guide/quantization.md @@ -8,22 +8,23 @@ Like vLLM, we now support quantization methods such as compressed-tensors, AWQ, - + - + + + - - + @@ -32,14 +33,16 @@ Like vLLM, we now support quantization methods such as compressed-tensors, AWQ, - +
Compressed-Tensor (w8a8)Compressed-Tensors (w8a8-Int8) Weight only (w4a16/w8a16)
Dynamic StaticAWQ (w4a16)AWQ (w4a16) GPTQ (w4a16/w8a16)Compressed-Tensors (w4a16)
Dense/MoE Dense/MoE Dense/MoE Dense MoEDenseMoEDense/MoE
WIP
-+ W8A8 dynamic and static quantization are now supported for all LLMs and VLMs. -+ AWQ/GPTQ quantization is supported for all dense models. ++ Compressed-Tensors w8a8-Int8 dynamic and static quantization are supported for all LLMs and VLMs. ++ Compressed-Tensors w4a16 quantization is supported for all LLMs and VLMs. ++ AWQ (w4a16) quantization is supported for all LLMs and VLMs. ++ GPTQ (w4a16/w8a16) quantization is supported for all dense models. ## Usages diff --git a/vllm_kunlun/ops/__init__.py b/vllm_kunlun/ops/__init__.py index c5d2991..1776a76 100644 --- a/vllm_kunlun/ops/__init__.py +++ b/vllm_kunlun/ops/__init__.py @@ -15,13 +15,20 @@ # This file is a part of the vllm-ascend project. # +# embedding import vllm_kunlun.ops.rotary_embedding -import vllm_kunlun.ops.layernorm +import vllm_kunlun.ops.vocab_parallel_embedding + +# quantization import vllm_kunlun.ops.quantization.awq import vllm_kunlun.ops.quantization.gptq import vllm_kunlun.ops.quantization.moe_wna16 -import vllm_kunlun.ops.vocab_parallel_embedding +import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors +import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe +import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm +import vllm_kunlun.ops.quantization.kernels.kunlun_exllama_linear + +# base layers +import vllm_kunlun.ops.layernorm import vllm_kunlun.ops.linear import vllm_kunlun.ops.fused_moe.layer -import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe -import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm \ No newline at end of file diff --git a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py new file mode 100644 index 0000000..27d528f --- /dev/null +++ b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py @@ -0,0 +1,86 @@ +# +# Copyright (c) 2026 Baidu, Inc. All Rights Reserved. 
class KunlunCompressedTensorsConfig(CompressedTensorsConfig):
    """Compressed-tensors config whose fused-MoE layers use the Kunlun methods.

    Linear and attention layers get the stock compressed-tensors methods;
    ``FusedMoE`` layers are dispatched through
    ``KunlunCompressedTensorsMoEMethod.get_moe_method``.
    """

    def get_quant_method(
        self,
        layer: torch.nn.Module,
        prefix: str,
    ) -> Optional["QuantizeMethodBase"]:
        """Return the quantization method to attach to ``layer``.

        Args:
            layer: The module being built.
            prefix: Name of the layer, used to look up its quantization scheme
                and transform schemes.

        Returns:
            A linear method (optionally wrapped by a transform method) for
            ``LinearBase`` layers, a KV-cache method for ``Attention`` layers,
            a Kunlun MoE method for ``FusedMoE`` layers, otherwise ``None``.
        """
        from vllm.attention.layer import Attention  # Avoid circular import

        # Non-linear layers first: they need no scheme lookup.
        if not isinstance(layer, LinearBase):
            if isinstance(layer, Attention):
                return CompressedTensorsKVCacheMethod(self)
            if isinstance(layer, FusedMoE):
                return KunlunCompressedTensorsMoEMethod.get_moe_method(self, layer)
            return None

        # Gather the quantization scheme and any input/output transforms.
        scheme = self.get_scheme(layer=layer, layer_name=prefix)
        input_tfms, output_tfms = get_linear_transform_schemes(
            layer, prefix, self.transform_config, self.packed_modules_mapping
        )

        # A layer without a scheme stays unquantized.
        linear_method: LinearMethodBase
        if scheme is None:
            linear_method = UnquantizedLinearMethod()
        else:
            layer.scheme = scheme
            linear_method = CompressedTensorsLinearMethod(self)

        # Wrap with the transform method only when a transform is configured.
        if not any((input_tfms, output_tfms)):
            return linear_method
        return CompressedTensorsLinearTransformMethod.from_schemes(
            linear_method, scheme, input_tfms, output_tfms
        )
class KunlunCompressedTensorsMoEMethod(FusedMoEMethodBase):
    """Factory that selects the fused-MoE quantization method on Kunlun.

    Replaces the stock compressed-tensors MoE dispatcher so that WNA16 and
    dynamic-token W8A8 schemes resolve to the Kunlun implementations.
    """

    def __init__(self, moe: FusedMoEConfig):
        # NOTE: this was previously spelled ``__init_`` and therefore never
        # acted as the constructor.
        super().__init__(moe)

    @staticmethod
    def get_moe_method(
        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
        layer: torch.nn.Module,
    ) -> "KunlunCompressedTensorsMoEMethod":
        """Return the MoE method matching the layer's quantization scheme.

        Args:
            quant_config: Compressed-tensors config providing
                ``target_scheme_map`` and ``packed_modules_mapping``.
            layer: The fused-MoE layer; its ``moe_config`` is forwarded to the
                selected method.

        Returns:
            An instance of the MoE method selected for the scheme.

        Raises:
            ValueError: For a WNA16 group scheme with actorder=group/dynamic.
            RuntimeError: If no supported scheme matches.
        """
        # TODO: @dsikka: refactor this to use schemes as other kernels
        # are supported + check if the layer is being ignored.
        scheme_map = quant_config.target_scheme_map

        if "Linear" in scheme_map:
            # The config targets every "Linear" module with a single scheme.
            matched_target = "Linear"
        else:
            # The config may instead list the projections of the fused model.
            fused_layers = ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
            current_scheme = None
            for fused_layer in fused_layers:
                matched_target = find_matched_target(
                    layer_name=fused_layer,
                    module=layer,
                    targets=scheme_map.keys(),
                    fused_mapping=quant_config.packed_modules_mapping,
                )
                scheme = scheme_map.get(matched_target)

                # Only valid if down_proj, gate_proj, and up_proj are mapped
                # to the same quant scheme in the quant_config.
                if current_scheme is None:
                    current_scheme = scheme
                else:
                    assert current_scheme == scheme, (
                        "down_proj, gate_proj and up_proj must share one "
                        "quantization scheme"
                    )

        weight_quant = scheme_map[matched_target].get("weights")
        input_quant = scheme_map[matched_target].get("input_activations")

        if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
            # Equality, not ``in``: ``in`` on a str-enum is a substring test.
            if (
                weight_quant.strategy == QuantizationStrategy.GROUP
                and weight_quant.actorder
                in (ActivationOrdering.GROUP, ActivationOrdering.DYNAMIC)
            ):
                raise ValueError(
                    "WNA16MoE is not supported with actorder=group/dynamic."
                )
            # MarlinMoE kernel is not supported on XPU.
            logger.warning_once("Using KunlunCompressedTensorsWNA16MoEMethod")
            return KunlunCompressedTensorsWNA16MoEMethod(quant_config, layer.moe_config)

        if quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
            return CompressedTensorsW4A4MoeMethod(layer.moe_config)

        if (
            quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
            or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
            or quant_config._is_fp8_w8a8(weight_quant, input_quant)
        ):
            return CompressedTensorsW8A8Fp8MoEMethod(quant_config, layer.moe_config)

        if quant_config._is_dynamic_token_w8a8(weight_quant, input_quant):
            return KunlunCompressedTensorsW8A8Int8MoEMethod(
                quant_config, layer.moe_config
            )

        if quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant):
            return CompressedTensorsW4A8Int8MoEMethod(quant_config, layer.moe_config)

        raise RuntimeError(
            f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
        )
class KunlunCompressedTensorsWNA16MoEMethod(CompressedTensorsWNA16MoEMethod):
    """WNA16 fused-MoE method that dequantizes weights and calls Kunlun ops."""

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """Run the fused MoE on weights dequantized from packed int4.

        Both expert weight tensors are dequantized on every call, then passed
        to ``ops.fused_moe_ep`` when ``self.moe.use_ep`` is set and to
        ``ops.fused_moe`` otherwise. Several parameters (for example
        ``expert_map``, ``activation`` and the EPLB arguments) are accepted
        for signature compatibility but are not forwarded to either op.
        """
        # dequant packed weights to float16
        w13_fp16 = dequant_int4_native(
            weight_packed_uint8=layer.w13_weight_packed,
            scale=self.moe_quant_config.w1_scale,
        )
        w2_fp16 = dequant_int4_native(
            weight_packed_uint8=layer.w2_weight_packed,
            scale=self.moe_quant_config.w2_scale,
        )

        # Arguments common to the expert-parallel and regular kernels.
        moe_args = (x, w13_fp16, w2_fp16, router_logits, self.moe.ep_rank, top_k)
        shared_kwargs = dict(
            renormalize=renormalize,
            inplace=True,
            use_grouped_topk=use_grouped_topk,
            num_expert_group=num_expert_group,
            topk_group=topk_group,
        )

        if not self.moe.use_ep:
            # Only the non-EP op also receives scoring options and biases.
            return ops.fused_moe(
                *moe_args,
                **shared_kwargs,
                scoring_func=scoring_func,
                e_score_correction_bias=e_score_correction_bias,
                w1_bias=getattr(layer, "w13_bias", None),
                w2_bias=getattr(layer, "w2_bias", None),
            )

        return ops.fused_moe_ep(*moe_args, **shared_kwargs)
+compressed_tensors_moe.CompressedTensorsMoEMethod = KunlunCompressedTensorsMoEMethod +compressed_tensors_moe.CompressedTensorsWNA16MoEMethod = ( + KunlunCompressedTensorsWNA16MoEMethod +) +KunlunCompressedTensorsWNA16MoEMethod.__name__ = "CompressedTensorsWNA16MoEMethod" + +logger.info_once( + "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Int8MoEMethod \ + --> vllm_kunlun.ops.quantization.compressed_tensors_moe.KunlunCompressedTensorsW8A8Int8MoEMethod" +) +logger.info_once( + "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsMoEMethod \ + --> vllm_kunlun.ops.quantization.compressed_tensors_moe.KunlunCompressedTensorsMoEMethod" +) +logger.info_once( + "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsWNA16MoEMethod \ + --> vllm_kunlun.ops.quantization.compressed_tensors_moe.KunlunCompressedTensorsWNA16MoEMethod" ) diff --git a/vllm_kunlun/ops/quantization/kernels/kunlun_exllama_linear.py b/vllm_kunlun/ops/quantization/kernels/kunlun_exllama_linear.py new file mode 100644 index 0000000..2e68834 --- /dev/null +++ b/vllm_kunlun/ops/quantization/kernels/kunlun_exllama_linear.py @@ -0,0 +1,57 @@ +# +# Copyright (c) 2026 Baidu, Inc. All Rights Reserved. +# Author: Li Wei +# Email: liwei157@baidu.com +# This file is a part of the vllm-kunlun project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class KunlunExllamaLinearKernel(ExllamaLinearKernel):
    """Exllama linear kernel that runs the GEMM through ``xspeedgate_ops``."""

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Compute the quantized linear output for ``x``.

        Args:
            layer: Module holding the quantized weight parameters.
            x: Input activations; flattened to 2-D for the GEMM.
            bias: Optional bias added in place to the GEMM output.

        Returns:
            The output reshaped to ``x.shape[:-1]`` plus the partition's
            output width.
        """
        cfg = self.config

        w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)
        assert w_zp is not None, "Zero points are required by Exllama"
        assert w_g_idx is not None, "Group index is required by Exllama"

        # The kernel takes a 2-D activation matrix.
        flat_x = x.reshape(-1, x.shape[-1])
        # NOTE(review): the positional ``True`` is presumably the exllama
        # flag of gptq_gemm -- confirm against the xspeedgate_ops signature.
        output = torch.ops.xspeedgate_ops.gptq_gemm(
            flat_x, w_q, w_zp, w_s, w_g_idx, True, cfg.weight_type.size_bits
        )

        if bias is not None:
            output.add_(bias)

        # Restore the leading dimensions of the input.
        return output.reshape(x.shape[:-1] + (cfg.partition_weight_shape[1],))
def dequant_int4_native(weight_packed_uint8: torch.Tensor, scale: torch.Tensor):
    """Unpack uint4 weight from packed uint8 weight and dequant it to float16.

    Each byte holds two 4-bit values, low nibble first. The values are
    offset-binary, so 8 is subtracted to recover the signed range [-8, 7].
    The scale's last dimension is the number of quantization groups; each
    scale applies to one contiguous run of unpacked columns.

    Args:
        weight_packed_uint8: Packed weights with two int4 values per byte
            along the last dimension.
        scale: Scales whose leading dimensions match the weights and whose
            last dimension evenly divides the unpacked column count (a last
            dimension of 1 means one scale per row).

    Returns:
        A float16 tensor with twice as many columns as the packed input.

    Raises:
        ValueError: If the unpacked column count is not a multiple of the
            number of scale groups.
    """
    low_nibble = weight_packed_uint8 & 0xF
    high_nibble = (weight_packed_uint8 >> 4) & 0xF
    # Interleave (low, high) so unpacked columns keep their packing order.
    weight_unpacked_fp16 = (
        torch.stack((low_nibble, high_nibble), dim=-1)
        .reshape(*weight_packed_uint8.shape[:-1], -1)
        .contiguous()
        .to(torch.float16)
        - 8.0
    )

    num_cols = weight_unpacked_fp16.shape[-1]
    num_groups = scale.shape[-1]
    if num_groups == 0 or num_cols % num_groups != 0:
        raise ValueError(
            f"Unpacked weight columns ({num_cols}) must be a multiple of the "
            f"number of scale groups ({num_groups})"
        )
    group_size = num_cols // num_groups

    # repeat_interleave expands scales to s0,s0,...,s1,s1,... so column k is
    # scaled by scale[k // group_size]. ``repeat`` would tile them as
    # s0,s1,...,s0,s1,..., which is only right when there is a single group.
    weight_unpacked_fp16 *= scale.repeat_interleave(group_size, dim=-1)
    return weight_unpacked_fp16