From 0d094531b4a2c41d898a5322f998114fb7ad2fae Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Tue, 9 Dec 2025 08:47:19 +0800 Subject: [PATCH] =?UTF-8?q?[bugfix]=20Fixed=20the=20bug=20in=20retrieving?= =?UTF-8?q?=20the=20quantization=20method=20for=20mlp.=E2=80=A6=20(#4797)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When retrieving the quantization method for MOE (e.g., the quantization file of DeepSeek v3.2 exp does not match the model's naming convention in eager mode), a KeyError is raised: "model.layers.3.mlp.experts.weight not in self.quant_description". However, the quantization file is like: ```bash "model.layers.3.mlp.experts.255.gate_proj.weight": "W8A8_DYNAMIC", "model.layers.3.mlp.experts.255.gate_proj.weight_scale": "W8A8_DYNAMIC", "model.layers.3.mlp.experts.255.gate_proj.weight_offset": "W8A8_DYNAMIC", "model.layers.3.mlp.experts.255.down_proj.weight": "W8A8_DYNAMIC", "model.layers.3.mlp.experts.255.down_proj.weight_scale": "W8A8_DYNAMIC", "model.layers.3.mlp.experts.255.down_proj.weight_offset": "W8A8_DYNAMIC", "model.layers.3.mlp.experts.255.up_proj.weight": "W8A8_DYNAMIC", "model.layers.3.mlp.experts.255.up_proj.weight_scale": "W8A8_DYNAMIC", "model.layers.3.mlp.experts.255.up_proj.weight_offset": "W8A8_DYNAMIC", ``` Co-Authored-By: yangqinghao-cmss Signed-off-by: hfadzxy Co-authored-by: yangqinghao-cmss --- vllm_ascend/quantization/quant_config.py | 9 +++++++++ vllm_ascend/quantization/utils.py | 11 +++++++++++ 2 files changed, 20 insertions(+) diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index d31a696..49e4e07 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -157,6 +157,15 @@ class AscendQuantConfig(QuantizationConfig): f"Detected some but not all shards of {prefix} " "are quantized. 
All shards of fused layers " "to have the same precision.") + elif "experts" in prefix: + # For the experts' prefix (e.g., "model.layers.3.mlp.experts") + # Assume all experts within the same MLP use the same quantization method + experts_quant_description = [ + self.quant_description[layer] + for layer in self.quant_description if prefix in layer + ] + is_skipped = any(quantization == "FLOAT" + for quantization in experts_quant_description) else: is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT" diff --git a/vllm_ascend/quantization/utils.py b/vllm_ascend/quantization/utils.py index 6d914c0..0fb156a 100644 --- a/vllm_ascend/quantization/utils.py +++ b/vllm_ascend/quantization/utils.py @@ -52,6 +52,17 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str, f"Not all shards of {prefix} are quantized with same quant type." f"Shard {proj_name} uses {shard_quant_type}, but another shard" f"use {quant_type}. Please check quantization config.") + elif "experts" in prefix: + # For the experts' prefix (e.g., "model.layers.3.mlp.experts") + # Assume all experts within the same MLP use the same quantization method + experts_quant_description = set(quant_description[layer] + for layer in quant_description + if prefix in layer) + if not len(experts_quant_description) == 1: + raise RuntimeError( + f"{prefix} has different quantization type: {experts_quant_description}." + ) + quant_type = experts_quant_description.pop() else: quant_type = quant_description[prefix + '.weight'] return quant_type