[BugFix]bug fix for dispatch_ffn_combine (#6156)

### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?
The synchronization logic of the fusion operator copies EP *
expertPerRank int32 values. This buffer carries both synchronization
signals and payload data.

On Ascend A3, a 512B DataBlock is written to HBM atomically: all data
within the same 512B block becomes visible as a unit.

For the DeepSeek model, when expertPerRank per device is 16, EP *
expertPerRank is 512B-aligned in both the 16-device single-node and the
32-device two-node scenarios. We therefore check only the first int32 of
each 512B block: a non-zero value there indicates that the entire 512B
block has already been written.

However, when expertPerRank per device is not 16, EP * expertPerRank is
not 512B-aligned, so the per-block check no longer lines up with 512B
block boundaries and can falsely report data as complete before it has
fully arrived.

Therefore, this change pads the EP * expertPerRank data length up to the
next multiple of 512B (ALIGN_128 int32 elements), so the first-int32
check of every block remains valid.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
d68209402d

---------

Signed-off-by: lhchg <lhao_cheng@163.com>
Co-authored-by: lihaocheng <lihaosheng1@h-partners.com>
This commit is contained in:
lhchg
2026-01-23 21:14:18 +08:00
committed by GitHub
parent 44a4ff6960
commit 717d299ae5
2 changed files with 7 additions and 5 deletions

View File

@@ -224,7 +224,7 @@ private:
tokenPerExpert.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert));
tokenPerExpertLayout = Layout3D(params.EP * params.expertPerRank, params.expertPerRank);
tokenPerExpertLayout = Layout3D( AlignUp(params.EP * params.expertPerRank, ALIGN_128), params.expertPerRank);
}
template<typename T>
@@ -291,7 +291,7 @@ private:
AscendC::DataCopyPad(
tmpBuffer1,
tokenPerExpert[rankId * expertPerRank],
{U16(EP), U16(expertPerRank * sizeof(int32_t)), U16(((EP - 1) * expertPerRank) * sizeof(int32_t)), 0},
{U16(EP), U16(expertPerRank * sizeof(int32_t)), U16((AlignUp(EP * expertPerRank, ALIGN_128) - expertPerRank) * sizeof(int32_t)), 0},
{}
);
@@ -547,7 +547,7 @@ private:
CATLASS_DEVICE
void CrossRankSyncAndlocalTokenPerExpertAllGather(Params const &params, int64_t localTokenPerExpertOffset){
AscendC::LocalTensor<int32_t> tmpBuffer = resource.ubBuf.template GetBufferByByte<int32_t>(0);
uint32_t numPerCore = params.EP * params.expertPerRank;
uint32_t numPerCore = AlignUp(params.EP * params.expertPerRank, ALIGN_128);
for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
if (dstEpIdx == params.rank) {
continue;
@@ -582,12 +582,13 @@ private:
AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
}
for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
if (dstEpIdx == params.rank) {
continue;
}
int32_t intPer512 = CACHE_LINE / sizeof(int);
for(int32_t checkIdx = 0; checkIdx < params.EP * params.expertPerRank; checkIdx += intPer512) {
for(int32_t checkIdx = 0; checkIdx < AlignUp(params.EP * params.expertPerRank, ALIGN_128); checkIdx += intPer512) {
__gm__ int32_t* sync_check = reinterpret_cast<__gm__ int32_t*>(shmem() + peermemInfo.offsetPeerTokenPerExpert) + tokenPerExpertLayout(dstEpIdx, 0, checkIdx);
gm_signal_wait_until_ne(sync_check, 0);
}
@@ -776,7 +777,7 @@ private:
}
blockEpilogue.Finalize();
AscendC::SyncAll<true>();
ResetTokenPerExpert(tokenPerExpert, params.EP * params.EP * params.expertPerRank);
ResetTokenPerExpert(tokenPerExpert, params.EP * AlignUp(params.EP * params.expertPerRank, ALIGN_128));
shmem.CrossRankSync();
MoeTokenUnpermuteTilingData tilingData;
MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum);

View File

@@ -5,4 +5,5 @@ constexpr static uint64_t MB_SIZE = 1024 * 1024UL;
constexpr static int32_t NUMS_PER_FLAG = 16;
constexpr static int32_t CACHE_LINE = 512;
constexpr static int32_t RESET_VAL = 0xffff;
constexpr static int32_t ALIGN_128 = 128;
#endif