From bec86418764bfe43bcf431e6693976fb024afa92 Mon Sep 17 00:00:00 2001
From: wangqiankun13
Date: Wed, 21 Jan 2026 09:26:40 +0800
Subject: [PATCH] [BugFix] Fix input parameter bug of dispatch_gmm_combine_decode
 [RFC: issue 5476] (#5932)

### What this PR does / why we need it?

In [PR 5040](https://github.com/vllm-project/vllm-ascend/pull/5040), the `dispatch_gmm_combine_decode` operator was configured with an incorrect `global_bs` parameter. This PR fixes that bug.

The `global_bs` input should have the same meaning as in the `moe_distributed_dispatch` operator, namely (the maximum batch size across all cards) * (expert parallel world size). The previous implementation instead used the variable `max_num_tokens`, which does not account for tensor parallelism; this likely resulted in an unnecessarily large (overestimated) value.

For more information about this operator, please refer to the RFC: https://github.com/vllm-project/vllm-ascend/issues/5476

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Accuracy test of qwen3-235b with EPLB on a single A3 node (ep16), with `dispatch_gmm_combine_decode` enabled:

| dataset  | version | metric   | mode | vllm-api-stream-chat |
| -------- | ------- | -------- | ---- | -------------------- |
| aime2024 | 604a78  | accuracy | gen  | 80.00                |

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/11b6af5280d6d6dfb8953af16e67b25f819b3be9

Signed-off-by: wangqiankun
---
 vllm_ascend/ops/fused_moe/moe_comm_method.py  | 2 +-
 vllm_ascend/ops/fused_moe/token_dispatcher.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_ascend/ops/fused_moe/moe_comm_method.py b/vllm_ascend/ops/fused_moe/moe_comm_method.py
index 41cad9af..458557e9 100644
--- a/vllm_ascend/ops/fused_moe/moe_comm_method.py
+++ b/vllm_ascend/ops/fused_moe/moe_comm_method.py
@@ -343,7 +343,7 @@ class FusedMC2CommImpl(MoECommMethod):
                 ep_rank_size=self.token_dispatcher.ep_world_size,
                 ep_rank_id=self.token_dispatcher.ep_rank_id,
                 moe_expert_num=self.moe_config.num_experts,
-                global_bs=self.token_dispatcher.fused_global_bs)
+                global_bs=self.token_dispatcher.global_bs)
         else:
             raise ValueError(
                 f"Wrong value of {envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2=}")
diff --git a/vllm_ascend/ops/fused_moe/token_dispatcher.py b/vllm_ascend/ops/fused_moe/token_dispatcher.py
index b046e953..a783da19 100644
--- a/vllm_ascend/ops/fused_moe/token_dispatcher.py
+++ b/vllm_ascend/ops/fused_moe/token_dispatcher.py
@@ -137,7 +137,6 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
         max_num_tokens = min(max_num_reqs * uniform_decode_query_len, 512)
         num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size
         self.global_bs = num_tokens_per_tp_rank * self.ep_world_size
-        self.fused_global_bs = max_num_tokens * self.ep_world_size
 
     def get_dispatch_mc2_kwargs(
         self,
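
A minimal sketch of the `global_bs` arithmetic described in the PR description, using hypothetical parallelism settings; the helper function and the concrete numbers below are illustrative assumptions, not part of the patched code:

```python
# Illustrative sketch only: compares the corrected global_bs (per-TP-rank tokens
# times the EP world size) with the removed fused_global_bs value, which ignored
# tensor parallelism. All concrete numbers here are hypothetical.

def sketch_global_bs(max_num_reqs: int, uniform_decode_query_len: int,
                     tp_size: int, ep_world_size: int) -> tuple[int, int]:
    # Same arithmetic as TokenDispatcherWithMC2 in the diff above.
    max_num_tokens = min(max_num_reqs * uniform_decode_query_len, 512)
    num_tokens_per_tp_rank = (max_num_tokens + tp_size - 1) // tp_size
    corrected = num_tokens_per_tp_rank * ep_world_size    # kept: self.global_bs
    overestimated = max_num_tokens * ep_world_size         # removed: self.fused_global_bs
    return corrected, overestimated


if __name__ == "__main__":
    corrected, overestimated = sketch_global_bs(
        max_num_reqs=128, uniform_decode_query_len=1, tp_size=4, ep_world_size=16)
    print(f"corrected global_bs = {corrected}")         # 512
    print(f"old fused_global_bs = {overestimated}")     # 2048, roughly tp_size times larger
```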