From a336543977a731992199d8a0734fd2a51c2b588a Mon Sep 17 00:00:00 2001
From: 欧派果奶我还要 <47294568+845473182@users.noreply.github.com>
Date: Fri, 5 Dec 2025 16:04:24 +0800
Subject: [PATCH] [Bugfix] fix quant_apply_mlp w1_scale type error & fix
 getting num_local_expert (#4632)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
Fix bugs introduced by
https://github.com/vllm-project/vllm-ascend/commit/bc67696a02fffff3b5d246efe17d0a01e52ab2a4

1. Fix the error in getting num_local_expert in vllm_adaptor (see the
   first sketch after the diff).
2. Fix the w1_scale type error in the npu_dequant_swiglu_quant call of
   moe_mlp.quant_apply_mlp in the w4a8 quantized scenario (see the
   second sketch after the diff).

- vLLM version: v0.12.0

---------

Signed-off-by: 白永斌
Signed-off-by: 欧派果奶我还要 <47294568+845473182@users.noreply.github.com>
Co-authored-by: 白永斌
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: wangxiyuan
---
 vllm_ascend/eplb/adaptor/vllm_adaptor.py | 4 ++--
 vllm_ascend/ops/fused_moe/moe_mlp.py     | 2 +-
 vllm_ascend/quantization/w8a8_dynamic.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py
index 47a99d1b..8aabcc3c 100644
--- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py
+++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py
@@ -107,8 +107,8 @@ class VllmEplbAdaptor(EplbAdaptor):
         self.buffer_tensor_list[buffer_id].append(buffer_tensor)
 
     def init_expert_param_per_layer(self):
-        num_local_expert = self.param_dict["model.layers." + str(self.num_dense_layers) + \
-            ".mlp.experts." + self.expert_weight_names[0]].data.shape[0]
+        key = f"model.layers.{self.num_dense_layers}.mlp.experts.{self.expert_weight_names[0]}"
+        num_local_expert = len(self.param_dict[key])
         for moe_layer_id in range(self.num_moe_layers):
             layer_idx = self.num_dense_layers + moe_layer_id
             self.expert_param_per_layer[layer_idx] = list()
diff --git a/vllm_ascend/ops/fused_moe/moe_mlp.py b/vllm_ascend/ops/fused_moe/moe_mlp.py
index 13e1efc0..3b182b17 100644
--- a/vllm_ascend/ops/fused_moe/moe_mlp.py
+++ b/vllm_ascend/ops/fused_moe/moe_mlp.py
@@ -129,7 +129,7 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
         # act_fn: swiglu
         hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
             x=hidden_states,
-            weight_scale=w1_scale,
+            weight_scale=w1_scale[0],
             activation_scale=pertoken_scale,
             bias=None,
             quant_scale=None,
diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
index 00c42cd8..2e86dd6e 100644
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -289,7 +289,7 @@ class AscendW8A8DynamicFusedMoEMethod:
         ]
         layer.w13_weight_scale_fp32_list = [
             weight.clone()
-            for weight in layer.w13_weight_scale.data.unbind(dim=0)
+            for weight in layer.w13_weight_scale_fp32.data.unbind(dim=0)
         ]
         layer.w2_weight_scale_list = [
             weight.clone()
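
For context on fix 1: `len()` returns the first-dimension size of a stacked
tensor and the element count of a list, whereas `.data.shape[0]` only works
for the stacked-tensor layout. A minimal sketch of this, using illustrative
shapes and a hypothetical per-expert list in place of the adaptor's real
`param_dict` entries:

```python
import torch

# Hypothetical stand-ins for the two layouts self.param_dict[key] may hold:
stacked = torch.zeros(8, 4096, 7168)      # 8 local experts stacked on dim 0
per_expert = list(stacked.unbind(dim=0))  # the same experts as a Python list

# len() of a tensor equals its shape[0], so it covers both layouts.
assert len(stacked) == stacked.shape[0] == 8
assert len(per_expert) == 8
```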
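
For context on fix 2: the indexing suggests that in the w4a8 path `w1_scale`
carries an extra leading level, while the `weight_scale` argument of
`torch_npu.npu_dequant_swiglu_quant` consumes a single per-expert scale
tensor, so `w1_scale[0]` unwraps it. A sketch under that assumption, with
hypothetical shapes and no NPU call:

```python
import torch

num_experts, intermediate_size = 8, 4096                      # hypothetical sizes
w1_scale = torch.rand(1, num_experts, 2 * intermediate_size)  # assumed w4a8 layout

# w1_scale[0] drops the assumed leading level, yielding the 2-D per-expert
# scale tensor that the kernel's weight_scale argument expects.
weight_scale = w1_scale[0]
assert weight_scale.shape == (num_experts, 2 * intermediate_size)
```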