[Bugfix] Fix the moe_forward error when setting enable_static_kernel to true (#6964)
### What this PR does / why we need it?
Fix the moe_forward error when setting enable_static_kernel to true.
When static kernels are enabled, the forward pass runs twice
(compilation + capture), so moe_layer_index advances past the end of
all_moe_layers. Wrap the index modulo the number of MoE layers to
prevent out-of-bounds errors.
### Does this PR introduce _any_ user-facing change?
None
### How was this patch tested?
CI passed with the newly added test.
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
This commit is contained in:
@@ -320,6 +320,7 @@ class AscendFusedMoE(FusedMoE):
|
||||
self.quant_method.create_weights(layer=self, **moe_quant_params)
|
||||
|
||||
self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
|
||||
self.enable_npugraph_ex_static_kernel = ascend_config.ascend_compilation_config.enable_static_kernel
|
||||
|
||||
setup_moe_comm_method(self.moe_config)
|
||||
self.quant_type = self._get_quant_type()
|
||||
@@ -391,6 +392,10 @@ class AscendFusedMoE(FusedMoE):
|
||||
assert self.quant_method is not None
|
||||
|
||||
forward_context = get_forward_context()
|
||||
# When static kernels are enabled, the forward pass runs twice (compilation + capture),
|
||||
# causing moe_layer_index to overflow. Wrap the index to prevent out-of-bounds errors.
|
||||
if self.enable_npugraph_ex_static_kernel:
|
||||
forward_context.moe_layer_index = forward_context.moe_layer_index % (len(forward_context.all_moe_layers))
|
||||
|
||||
# Load balancing for token distribution among experts in dummy_run
|
||||
# TODO: The community only considers load balancing when DP > 1.
|
||||
|
||||
Reference in New Issue
Block a user