From 717d299ae5a74c43ed142975f288fef1b4d7983e Mon Sep 17 00:00:00 2001 From: lhchg Date: Fri, 23 Jan 2026 21:14:18 +0800 Subject: [PATCH] [BugFix]bug fix for dispatch_ffn_combine (#6156) ### What this PR does / why we need it? ### Does this PR introduce _any_ user-facing change? Part of the synchronization logic of the fusion operator copies EP * expertPerRank int32 values. This data carries both the synchronization signals and the payload. On Ascend A3, a 512B DataBlock is written to HBM atomically, i.e. all data in the same 512B block arrives together. For the DeepSeek model, when expertPerRank per device is 16, EP * expertPerRank is 512B-aligned (a multiple of 128 int32 values) in both the 16-device single-node and the 32-device two-node scenarios. Therefore, we only check the first int32 of each 512B block: if the value is not 0, the whole 512B block has been sent. However, when expertPerRank per device is not 16, EP * expertPerRank is not guaranteed to be 512B-aligned, and the per-512B first-element check described above no longer corresponds to whole 512B blocks and becomes unreliable. Therefore, this patch pads the EP * expertPerRank data length up to a 512B-aligned length (a multiple of 128 int32 values, ALIGN_128). ### How was this patch tested? 
- vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60 --------- Signed-off-by: lhchg Co-authored-by: lihaocheng --- .../op_kernel/dispatch_ffn_combine_kernel.hpp | 11 ++++++----- .../op_kernel/utils/const_args.hpp | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp b/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp index 469a89e2..a0fe0ad8 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp +++ b/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp @@ -224,7 +224,7 @@ private: tokenPerExpert.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert)); - tokenPerExpertLayout = Layout3D(params.EP * params.expertPerRank, params.expertPerRank); + tokenPerExpertLayout = Layout3D( AlignUp(params.EP * params.expertPerRank, ALIGN_128), params.expertPerRank); } template @@ -291,7 +291,7 @@ private: AscendC::DataCopyPad( tmpBuffer1, tokenPerExpert[rankId * expertPerRank], - {U16(EP), U16(expertPerRank * sizeof(int32_t)), U16(((EP - 1) * expertPerRank) * sizeof(int32_t)), 0}, + {U16(EP), U16(expertPerRank * sizeof(int32_t)), U16((AlignUp(EP * expertPerRank, ALIGN_128) - expertPerRank) * sizeof(int32_t)), 0}, {} ); @@ -547,7 +547,7 @@ private: CATLASS_DEVICE void CrossRankSyncAndlocalTokenPerExpertAllGather(Params const ¶ms, int64_t localTokenPerExpertOffset){ AscendC::LocalTensor tmpBuffer = resource.ubBuf.template GetBufferByByte(0); - uint32_t numPerCore = params.EP * params.expertPerRank; + uint32_t numPerCore = AlignUp(params.EP * params.expertPerRank, ALIGN_128); for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) { if (dstEpIdx == params.rank) { continue; @@ -582,12 +582,13 @@ private: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); } + for(int32_t dstEpIdx = coreIdx; dstEpIdx < 
params.EP; dstEpIdx += coreNum) { if (dstEpIdx == params.rank) { continue; } int32_t intPer512 = CACHE_LINE / sizeof(int); - for(int32_t checkIdx = 0; checkIdx < params.EP * params.expertPerRank; checkIdx += intPer512) { + for(int32_t checkIdx = 0; checkIdx < AlignUp(params.EP * params.expertPerRank, ALIGN_128); checkIdx += intPer512) { __gm__ int32_t* sync_check = reinterpret_cast<__gm__ int32_t*>(shmem() + peermemInfo.offsetPeerTokenPerExpert) + tokenPerExpertLayout(dstEpIdx, 0, checkIdx); gm_signal_wait_until_ne(sync_check, 0); } @@ -776,7 +777,7 @@ private: } blockEpilogue.Finalize(); AscendC::SyncAll(); - ResetTokenPerExpert(tokenPerExpert, params.EP * params.EP * params.expertPerRank); + ResetTokenPerExpert(tokenPerExpert, params.EP * AlignUp(params.EP * params.expertPerRank, ALIGN_128)); shmem.CrossRankSync(); MoeTokenUnpermuteTilingData tilingData; MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum); diff --git a/csrc/dispatch_ffn_combine/op_kernel/utils/const_args.hpp b/csrc/dispatch_ffn_combine/op_kernel/utils/const_args.hpp index 61a3d866..84cb6c4e 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/utils/const_args.hpp +++ b/csrc/dispatch_ffn_combine/op_kernel/utils/const_args.hpp @@ -5,4 +5,5 @@ constexpr static uint64_t MB_SIZE = 1024 * 1024UL; constexpr static int32_t NUMS_PER_FLAG = 16; constexpr static int32_t CACHE_LINE = 512; constexpr static int32_t RESET_VAL = 0xffff; +constexpr static int32_t ALIGN_128 = 128; #endif \ No newline at end of file