[Feat] Support Kimi-K2-Thinking native W4A16 quantized expert weights (#4516)

### What this PR does / why we need it?

Adds a W4A16 quantization method for the Kimi-K2-Thinking model and
updates the relevant modules to support it.

- Implements the complete W4A16 quantization method, including weight
packing/unpacking, per-group quantization parameter generation,
post-processing logic, and MoE method application (see the packing sketch
after this list).
- Adds the `use_int4_w4a16`, `w1_offset` and `w2_offset` parameters and
adjusts the `with_quant` conditional logic to support W4A16 matrix
multiplication.
- Adds a `packed_modules_model_mapping` entry for the Kimi-K2-Thinking model
and processing logic for the `weight_packed` field.
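
The exact kernel-side layout is defined in the new `w4a16` module rather than
spelled out here, but as a rough orientation the sketch below shows the general
W4A16 idea: weights are quantized per group along the input dimension, two int4
values are packed into one byte, and each group keeps a scale and an offset
(the role the `w1_offset`/`w2_offset` parameters play for the two expert
projections). The helper names, the group size of 128 and the low/high-nibble
order are illustrative assumptions, not the layout used by the Ascend kernels.

```python
import torch


def quant_pack_w4_per_group(weight: torch.Tensor, group_size: int = 128):
    """Illustrative per-group affine int4 quantization plus nibble packing.

    weight: (n, k) float tensor with k divisible by group_size.
    Returns packed uint8 weights of shape (n, k // 2) and per-group
    scales/offsets of shape (n, k // group_size).
    """
    n, k = weight.shape
    groups = weight.reshape(n, k // group_size, group_size)

    # Affine parameters mapping each group's [min, max] onto the uint4 range [0, 15].
    g_min = groups.amin(dim=-1, keepdim=True)
    g_max = groups.amax(dim=-1, keepdim=True)
    scales = (g_max - g_min).clamp(min=1e-6) / 15.0
    offsets = (-g_min / scales).round()

    q = (groups / scales + offsets).round().clamp(0, 15)
    q = q.reshape(n, k).to(torch.uint8)

    # Pack two 4-bit values per byte: even columns -> low nibble, odd -> high nibble.
    packed = q[:, 0::2] | (q[:, 1::2] << 4)
    return packed, scales.squeeze(-1), offsets.squeeze(-1)


def unpack_dequant_w4(packed: torch.Tensor, scales: torch.Tensor,
                      offsets: torch.Tensor, group_size: int = 128):
    """Inverse of the sketch above: unpack the nibbles, then apply scale/offset."""
    low = (packed & 0x0F).to(torch.float32)
    high = (packed >> 4).to(torch.float32)
    q = torch.stack((low, high), dim=-1).reshape(packed.shape[0], -1)
    n, k = q.shape
    groups = q.reshape(n, k // group_size, group_size)
    return ((groups - offsets.unsqueeze(-1)) * scales.unsqueeze(-1)).reshape(n, k)
```

In the kernels added by this PR the dequantization happens inside the grouped
matmul (`use_int4_w4a16=True`) rather than eagerly as above; the sketch is only
meant to make the per-group scale/offset terminology concrete.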

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
Signed-off-by: Ruri <33858552+zhoux77899@users.noreply.github.com>
Signed-off-by: Ruri <zhouxiang100@huawei.com>
Author: Ruri
Date: 2025-12-10 15:58:52 +08:00 (committed by GitHub)
parent c1db298f43
commit ce5872705e
13 changed files with 781 additions and 13 deletions


@@ -4,7 +4,8 @@ import torch
 from compressed_tensors.quantization import (QuantizationArgs,
                                              QuantizationStrategy)
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import (
     QUANTIZATION_METHODS, register_quantization_config)
@@ -16,8 +17,11 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
-from vllm_ascend.quantization.quant_config import (AscendLinearMethod,
+from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod
+from vllm_ascend.quantization.quant_config import (AscendFusedMoEMethod,
+                                                    AscendLinearMethod,
                                                     AscendQuantConfig)
+from vllm_ascend.quantization.w4a16 import AscendW4A16FusedMoEMethod
 from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
 from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
 from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
@@ -142,7 +146,7 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
             quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
             # choose quantization method
-            quant_method: LinearMethodBase = UnquantizedLinearMethod()
+            quant_method = UnquantizedLinearMethod()
             if quant_scheme is not None:
                 layer.scheme = quant_scheme
                 ascend_quant_config = AscendQuantConfig(self.quant_description
@@ -150,6 +154,21 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
                 quant_method = AscendLinearMethod(ascend_quant_config, prefix,
                                                   None, layer)
             return quant_method
+
+        if isinstance(layer, FusedMoE):
+            layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
+            # collect schemes
+            quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
+            # choose quantization method
+            quant_method = AscendUnquantizedFusedMoEMethod(layer.moe_config)
+            if quant_scheme is not None:
+                layer.scheme = quant_scheme
+                ascend_quant_config = AscendQuantConfig(self.quant_description
+                                                        or {})
+                quant_method = AscendFusedMoEMethod(
+                    ascend_quant_config, prefix,
+                    ascend_quant_config.packed_modules_mapping, layer)
+            return quant_method
         return None

     def get_scheme(self,
@@ -215,6 +234,10 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
             return AscendW8A8DynamicLinearMethod()

+        if weight_quant is not None:
+            if self._is_w4a16(weight_quant):
+                return AscendW4A16FusedMoEMethod()
+
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
@@ -246,6 +269,10 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
         # Only symmetric weight quantization supported.
         return is_8_bits and is_token and is_symmetric and is_dynamic

+    def _is_w4a16(self, weight_quant: QuantizationArgs) -> bool:
+        is_4_bits = weight_quant.num_bits == 4
+        return is_4_bits
+
     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
         self.target_scheme_map = hf_to_vllm_mapper.apply_dict(
             self.target_scheme_map)
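
As a usage note (not part of the diff above): the new `_is_w4a16` predicate
only inspects the weight bit width, so any 4-bit compressed-tensors weight
scheme is routed to `AscendW4A16FusedMoEMethod` for FusedMoE layers. A
hypothetical scheme like the following would take that path; the strategy and
group size shown here are illustrative and not taken from the actual
Kimi-K2-Thinking checkpoint config:

```python
from compressed_tensors.quantization import (QuantizationArgs,
                                             QuantizationStrategy)

# Hypothetical per-group 4-bit weight scheme; only num_bits matters to the
# new _is_w4a16 check.
weight_quant = QuantizationArgs(num_bits=4,
                                strategy=QuantizationStrategy.GROUP,
                                group_size=128)
assert weight_quant.num_bits == 4  # mirrors _is_w4a16(weight_quant)
```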