From e67608041dc29ca0d9d41210d7f5f6eb812d397e Mon Sep 17 00:00:00 2001 From: wangqiankun13 Date: Thu, 15 Jan 2026 09:29:34 +0800 Subject: [PATCH] [main][BugFix]Fix DispatchGmmCombineDecode acc bug when big batch (#5808) ### What this PR does / why we need it? If one expert handles more than 48 * 8 tokens, DispatchGmmCombineDecode may incur an accuracy problem, because a flag is set too early. > Reason: The LocalTensors ubInputRightHalf, ubInputTmp, ubInputRightHalf, ubQuantF32, ubQuantS32, and ubQuantF16 share the same space as ubAbs, so the copy from gm into ubInputRightHalf can start only after all of them are free; before this PR, AscendC::SetFlag(0) was issued too early. This PR sets the flag at the right time. For more information about this operator, please refer to the RFC issue: https://github.com/vllm-project/vllm-ascend/issues/5476 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Tested qwen3-235b EPLB with DispatchGmmCombineDecode on a single A3 node (ep16) | dataset | version | metric | mode | vllm-api-stream-chat | |----- | ----- | ----- | ----- | -----| | aime2024 | 604a78 | accuracy | gen | 86.67 | - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d Signed-off-by: wangqiankun --- ...lice_m_per_token_dequant_swiglu_quant_multistage_workspace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_swiglu_quant_multistage_workspace.h b/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_swiglu_quant_multistage_workspace.h index 2420ba3e..cf4956e0 100644 --- a/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_swiglu_quant_multistage_workspace.h +++ 
b/csrc/dispatch_gmm_combine_decode/op_kernel/dispatch_gmm_combine_decode/gemm/kernel/grouped_matmul_slice_m_per_token_dequant_swiglu_quant_multistage_workspace.h @@ -230,7 +230,6 @@ public: AscendC::WholeReduceMax(ubReduceMax, ubMax, mask, tileRow, 1, 1, halfTileColumn / elementPerBlk, AscendC::ReduceOrder::ORDER_ONLY_VALUE); AscendC::SetFlag(0); - AscendC::SetFlag(0); AscendC::PipeBarrier(); AscendC::WaitFlag(0); @@ -266,6 +265,7 @@ public: AscendC::WaitFlag(1); AscendC::Cast(ubOutput, ubQuantF16, AscendC::RoundMode::CAST_RINT, tileCount); AscendC::SetFlag(1); + AscendC::SetFlag(0); auto gmTileOutput = gmOutput[params.layoutOutput.GetOffset(tileOffset)]; auto layoutGmTileOutput = params.layoutOutput.GetTileLayout(actualTileShape);