From 8c2c82f3e1ddac019de9619e9dde340c9ed863ef Mon Sep 17 00:00:00 2001
From: panchao-hub <315134829@qq.com>
Date: Fri, 6 Mar 2026 10:36:10 +0800
Subject: [PATCH] =?UTF-8?q?[Bugfix]=20Fix=20the=20moe=5Fforward=20error=20?=
 =?UTF-8?q?when=20setting=20enable=5Fstatic=5Fkernel=20=E2=80=A6=20(#6964)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?
Fix the moe_forward error when setting enable_static_kernel to true.
When static kernels are enabled, the forward pass runs twice
(compilation + capture), causing moe_layer_index to overflow. Wrap the
index to prevent out-of-bounds errors.

### Does this PR introduce _any_ user-facing change?
None

### How was this patch tested?
CI passed with new added test

- vLLM version: v0.16.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/15d76f74e2fdb12a95ea00f0ca283acf6219a2b7

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
---
 vllm_ascend/ops/fused_moe/fused_moe.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 7d7b581d..79ba6623 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -320,6 +320,7 @@ class AscendFusedMoE(FusedMoE):
         self.quant_method.create_weights(layer=self, **moe_quant_params)
 
         self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
+        self.enable_npugraph_ex_static_kernel = ascend_config.ascend_compilation_config.enable_static_kernel
 
         setup_moe_comm_method(self.moe_config)
         self.quant_type = self._get_quant_type()
@@ -391,6 +392,10 @@ class AscendFusedMoE(FusedMoE):
         assert self.quant_method is not None
 
         forward_context = get_forward_context()
+        # When static kernels are enabled, the forward pass runs twice (compilation + capture),
+        # causing moe_layer_index to overflow. Wrap the index to prevent out-of-bounds errors.
+        if self.enable_npugraph_ex_static_kernel:
+            forward_context.moe_layer_index = forward_context.moe_layer_index % (len(forward_context.all_moe_layers))
 
         # Load balancing for token distribution among experts in dummy_run
         # TODO: The community only considers load balancing when DP > 1.