[Bugfix] Support Qwen3-MOE on aclgraph mode (#1381)
### What this PR does / why we need it? Fix the shape of the `npu_moe_init_routing` input parameters to support aclgraph mode on qwen3-moe In addition to this PR, resolving the `gatherv3` error might be necessary. See related PR https://github.com/vllm-project/vllm-ascend/pull/1297 https://github.com/vllm-project/vllm-ascend/pull/1446 Thanks to @yiz-liu for providing the idea ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Tested on Qwen3-30B-A3B Closes: https://github.com/vllm-project/vllm-ascend/issues/1368 --------- Signed-off-by: ApsarasX <apsarax@outlook.com> Signed-off-by: Yikun Jiang <yikunkero@gmail.com> Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com> Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
@@ -18,6 +18,7 @@
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
from vllm.config import CompilationLevel, get_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.layer import \
|
||||
UnquantizedFusedMoEMethod
|
||||
|
||||
@@ -25,6 +26,15 @@ from vllm_ascend.ops.fused_moe import (fused_experts, fused_experts_moge,
|
||||
select_experts)
|
||||
from vllm_ascend.utils import is_310p
|
||||
|
||||
original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__
|
||||
|
||||
|
||||
def unquantized_fused_moe_init_func(self, *args, **kwargs):
|
||||
original_unquantized_fused_moe_init_func(self, *args, **kwargs)
|
||||
vllm_config = get_current_vllm_config()
|
||||
self.max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens
|
||||
self.use_aclgraph = vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not vllm_config.model_config.enforce_eager
|
||||
|
||||
|
||||
def forward_oot(
|
||||
self,
|
||||
@@ -71,6 +81,10 @@ def forward_oot(
|
||||
expert_map=expert_map,
|
||||
apply_router_weight_on_input=apply_router_weight_on_input)
|
||||
|
||||
# If use aclgraph, we need to set max_num_tokens to make
|
||||
# the input shape of `npu_moe_init_routing` fixed
|
||||
max_num_tokens = self.max_num_batched_tokens if self.use_aclgraph else None
|
||||
|
||||
return fused_experts(
|
||||
hidden_states=x,
|
||||
w1=layer.w13_weight,
|
||||
@@ -79,7 +93,9 @@ def forward_oot(
|
||||
topk_ids=topk_ids,
|
||||
top_k=top_k,
|
||||
expert_map=expert_map,
|
||||
apply_router_weight_on_input=apply_router_weight_on_input)
|
||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||
max_num_tokens=max_num_tokens)
|
||||
|
||||
|
||||
UnquantizedFusedMoEMethod.__init__ = unquantized_fused_moe_init_func
|
||||
UnquantizedFusedMoEMethod.forward_oot = forward_oot
|
||||
|
||||
Reference in New Issue
Block a user