From 717d299ae5a74c43ed142975f288fef1b4d7983e Mon Sep 17 00:00:00 2001
From: lhchg <lhao_cheng@163.com>
Date: Fri, 23 Jan 2026 21:14:18 +0800
Subject: [PATCH] [BugFix]bug fix for dispatch_ffn_combine (#6156)

### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?
Some synchronization logic of the fusion operator copies EP *
expertPerRank int32 values. This part of data contains synchronization
signals and data.

The 512B DataBlock of Ascend A3 writes all data in the same block
atomically to the HBM.

For the DeepSeek model, when expertPerRank per device is 16, the 512B
alignment is met in both 16-device single-node and 32-device two-node
scenarios. Therefore, we check the first position of each 512B data. If
the value is not 0, it indicates that the current 512B data has been
sent.

However, for other cases where expertPerRank per device is not 16, EP *
expertPerRank does not meet the 512B alignment. If the above logic is
used for checking, there will be problems.

Therefore, here we will pad the EP * expertPerRank data length to the
length aligned to 512B.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60

---------

Signed-off-by: lhchg <lhao_cheng@163.com>
Co-authored-by: lihaocheng <lihaosheng1@h-partners.com>
---
 .../op_kernel/dispatch_ffn_combine_kernel.hpp         | 11 ++++++-----
 .../op_kernel/utils/const_args.hpp                    |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp b/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp
index 469a89e2..a0fe0ad8 100644
--- a/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp
@@ -224,7 +224,7 @@ private:
 
         tokenPerExpert.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert));
 
-        tokenPerExpertLayout = Layout3D(params.EP * params.expertPerRank, params.expertPerRank);
+        tokenPerExpertLayout = Layout3D( AlignUp(params.EP * params.expertPerRank, ALIGN_128), params.expertPerRank);
     }
 
     template<typename T>
@@ -291,7 +291,7 @@ private:
         AscendC::DataCopyPad(
             tmpBuffer1,
             tokenPerExpert[rankId * expertPerRank],
-            {U16(EP), U16(expertPerRank * sizeof(int32_t)), U16(((EP - 1) * expertPerRank) * sizeof(int32_t)), 0},
+            {U16(EP), U16(expertPerRank * sizeof(int32_t)), U16((AlignUp(EP * expertPerRank, ALIGN_128) - expertPerRank) * sizeof(int32_t)), 0},
             {}
         );
 
@@ -547,7 +547,7 @@ private:
     CATLASS_DEVICE
     void CrossRankSyncAndlocalTokenPerExpertAllGather(Params const &params, int64_t localTokenPerExpertOffset){
         AscendC::LocalTensor<int32_t> tmpBuffer = resource.ubBuf.template GetBufferByByte<int32_t>(0);
-        uint32_t numPerCore = params.EP * params.expertPerRank;
+        uint32_t numPerCore = AlignUp(params.EP * params.expertPerRank, ALIGN_128);
         for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
             if (dstEpIdx == params.rank) {
                 continue;
@@ -582,12 +582,13 @@ private:
             AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
             AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
         }
+
         for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
             if (dstEpIdx == params.rank) {
                 continue;
             }
             int32_t intPer512 = CACHE_LINE / sizeof(int);
-            for(int32_t checkIdx = 0; checkIdx < params.EP * params.expertPerRank; checkIdx += intPer512) {
+            for(int32_t checkIdx = 0; checkIdx < AlignUp(params.EP * params.expertPerRank, ALIGN_128); checkIdx += intPer512) {
                 __gm__ int32_t* sync_check = reinterpret_cast<__gm__ int32_t*>(shmem() + peermemInfo.offsetPeerTokenPerExpert) + tokenPerExpertLayout(dstEpIdx, 0, checkIdx);
                 gm_signal_wait_until_ne(sync_check, 0);
             }
@@ -776,7 +777,7 @@ private:
         }
         blockEpilogue.Finalize();
         AscendC::SyncAll<true>();
-        ResetTokenPerExpert(tokenPerExpert, params.EP * params.EP * params.expertPerRank);
+        ResetTokenPerExpert(tokenPerExpert, params.EP * AlignUp(params.EP * params.expertPerRank, ALIGN_128));
         shmem.CrossRankSync();
         MoeTokenUnpermuteTilingData tilingData;
         MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum);
diff --git a/csrc/dispatch_ffn_combine/op_kernel/utils/const_args.hpp b/csrc/dispatch_ffn_combine/op_kernel/utils/const_args.hpp
index 61a3d866..84cb6c4e 100644
--- a/csrc/dispatch_ffn_combine/op_kernel/utils/const_args.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/utils/const_args.hpp
@@ -5,4 +5,5 @@ constexpr static uint64_t MB_SIZE = 1024 * 1024UL;
 constexpr static int32_t NUMS_PER_FLAG = 16;
 constexpr static int32_t CACHE_LINE = 512;
 constexpr static int32_t RESET_VAL = 0xffff;
+constexpr static int32_t ALIGN_128 = 128;
 #endif
\ No newline at end of file