From 8c2c82f3e1ddac019de9619e9dde340c9ed863ef Mon Sep 17 00:00:00 2001 From: panchao-hub <315134829@qq.com> Date: Fri, 6 Mar 2026 10:36:10 +0800 Subject: [PATCH] =?UTF-8?q?[Bugfix]=20Fix=20the=20moe=5Fforward=20error=20?= =?UTF-8?q?when=20setting=20enable=5Fstatic=5Fkernel=20=E2=80=A6=20(#6964)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Fix the moe_forward error when setting enable_static_kernel to true. When static kernels are enabled, the forward pass runs twice (compilation + capture), causing moe_layer_index to overflow. Wrap the index to prevent out-of-bounds errors. ### Does this PR introduce _any_ user-facing change? None ### How was this patch tested? CI passed with a newly added test - vLLM version: v0.16.0 - vLLM main: https://github.com/vllm-project/vllm/commit/15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 Signed-off-by: p00465316 Co-authored-by: p00465316 --- vllm_ascend/ops/fused_moe/fused_moe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index 7d7b581d..79ba6623 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -320,6 +320,7 @@ class AscendFusedMoE(FusedMoE): self.quant_method.create_weights(layer=self, **moe_quant_params) self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp + self.enable_npugraph_ex_static_kernel = ascend_config.ascend_compilation_config.enable_static_kernel setup_moe_comm_method(self.moe_config) self.quant_type = self._get_quant_type() @@ -391,6 +392,10 @@ class AscendFusedMoE(FusedMoE): assert self.quant_method is not None forward_context = get_forward_context() + # When static kernels are enabled, the forward pass runs twice (compilation + capture), + # causing moe_layer_index to overflow. Wrap the index to prevent out-of-bounds errors. 
+ if self.enable_npugraph_ex_static_kernel: + forward_context.moe_layer_index = forward_context.moe_layer_index % (len(forward_context.all_moe_layers)) # Load balancing for token distribution among experts in dummy_run # TODO: The community only considers load balancing when DP > 1.