[Bugfix] Fix the moe_forward error when setting enable_static_kernel … (#6964)

### What this PR does / why we need it?

Fix the moe_forward error that occurs when enable_static_kernel is set to true. When static kernels are enabled, the forward pass runs twice (compilation + capture), causing moe_layer_index to overflow. Wrap the index to prevent out-of-bounds errors.

### Does this PR introduce _any_ user-facing change?

None

### How was this patch tested?

CI passed with the newly added test.

- vLLM version: v0.16.0
- vLLM main: 15d76f74e2

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
2026-03-06 10:36:10 +08:00
parent a7820d20f4
commit 8c2c82f3e1
1 changed files with 5 additions and 0 deletions
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -320,6 +320,7 @@ class AscendFusedMoE(FusedMoE):
        self.quant_method.create_weights(layer=self, **moe_quant_params)

        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
+        self.enable_npugraph_ex_static_kernel = ascend_config.ascend_compilation_config.enable_static_kernel

        setup_moe_comm_method(self.moe_config)
        self.quant_type = self._get_quant_type()
@@ -391,6 +392,10 @@ class AscendFusedMoE(FusedMoE):
        assert self.quant_method is not None

        forward_context = get_forward_context()
+        # When static kernels are enabled, the forward pass runs twice (compilation + capture),
+        # causing moe_layer_index to overflow. Wrap the index to prevent out-of-bounds errors.
+        if self.enable_npugraph_ex_static_kernel:
+            forward_context.moe_layer_index = forward_context.moe_layer_index % (len(forward_context.all_moe_layers))

        # Load balancing for token distribution among experts in dummy_run
        # TODO: The community only considers load balancing when DP > 1.