[Feat] Support Kimi-K2-Thinking native W4A16 quantized expert weights (#4516)

### What this PR does / why we need it?

Adds a W4A16 quantization method for the Kimi-K2-Thinking model and updates the relevant modules to support the new quantization method.

- Implements the complete W4A16 quantization method, including weight packing/unpacking, per-group quantization parameter generation, post-processing logic, and MoE method application.
- Adds the parameters `use_int4_w4a16`, `w1_offset` and `w2_offset`, and adjusts the `with_quant` conditional logic to support W4A16 matrix multiplication.
- Adds `packed_modules_model_mapping` for the Kimi-K2-Thinking model and processing logic for the `weight_packed` field.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
Signed-off-by: Ruri <33858552+zhoux77899@users.noreply.github.com>
Signed-off-by: Ruri <zhouxiang100@huawei.com>
2025-12-10 15:58:52 +08:00
parent c1db298f43
commit ce5872705e
13 changed files with 781 additions and 13 deletions
--- a/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm_ascend/quantization/compressed_tensors/compressed_tensors.py
@@ -4,7 +4,8 @@ import torch
 from compressed_tensors.quantization import (QuantizationArgs,
                                             QuantizationStrategy)
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import (LinearBase,
                                               UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import (
    QUANTIZATION_METHODS, register_quantization_config)
@@ -16,8 +17,11 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    find_matched_target, is_activation_quantization_format,
    should_ignore_layer)

-from vllm_ascend.quantization.quant_config import (AscendLinearMethod,
+from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod
+from vllm_ascend.quantization.quant_config import (AscendFusedMoEMethod,
+                                                   AscendLinearMethod,
                                                   AscendQuantConfig)
+from vllm_ascend.quantization.w4a16 import AscendW4A16FusedMoEMethod
 from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
 from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
 from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
@@ -142,7 +146,7 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
            quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)

            # choose quantization method
-            quant_method: LinearMethodBase = UnquantizedLinearMethod()
+            quant_method = UnquantizedLinearMethod()
            if quant_scheme is not None:
                layer.scheme = quant_scheme
                ascend_quant_config = AscendQuantConfig(self.quant_description
@@ -150,6 +154,21 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
                quant_method = AscendLinearMethod(ascend_quant_config, prefix,
                                                  None, layer)
            return quant_method
+        if isinstance(layer, FusedMoE):
+            layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
+            # collect schemes
+            quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
+
+            # choose quantization method
+            quant_method = AscendUnquantizedFusedMoEMethod(layer.moe_config)
+            if quant_scheme is not None:
+                layer.scheme = quant_scheme
+                ascend_quant_config = AscendQuantConfig(self.quant_description
+                                                        or {})
+                quant_method = AscendFusedMoEMethod(
+                    ascend_quant_config, prefix,
+                    ascend_quant_config.packed_modules_mapping, layer)
+            return quant_method
        return None

    def get_scheme(self,
@@ -215,6 +234,10 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
            if self._is_dynamic_token_w8a8(weight_quant, input_quant):
                return AscendW8A8DynamicLinearMethod()

+        if weight_quant is not None:
+            if self._is_w4a16(weight_quant):
+                return AscendW4A16FusedMoEMethod()
+
        raise NotImplementedError(
            "No compressed-tensors compatible scheme was found.")

@@ -246,6 +269,10 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
        # Only symmetric weight quantization supported.
        return is_8_bits and is_token and is_symmetric and is_dynamic

+    def _is_w4a16(self, weight_quant: QuantizationArgs) -> bool:
+        """Return True if the weights are 4-bit; only bit width is checked."""
+        return weight_quant.num_bits == 4
+
    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        self.target_scheme_map = hf_to_vllm_mapper.apply_dict(
            self.target_scheme_map)