[BugFix]bug fix for dispatch_ffn_combine (#6156)

### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?
The synchronization logic of the fusion operator copies EP *
expertPerRank int32 values. This buffer carries both synchronization
signals and payload data.

On Ascend A3, a 512B DataBlock is written to HBM atomically: all data
within the same 512B block becomes visible as a unit.

For the DeepSeek model, when expertPerRank per device is 16, EP *
expertPerRank is 512B-aligned in both the 16-device single-node and the
32-device two-node scenarios. We therefore check only the first int32 of
each 512B block: a non-zero value there indicates that the entire 512B
block has already been written.

However, when expertPerRank per device is not 16, EP * expertPerRank is
not 512B-aligned, so the per-block check no longer lines up with 512B
block boundaries and can falsely report data as complete before it has
fully arrived.

Therefore, this change pads the EP * expertPerRank data length up to the
next multiple of 512B (ALIGN_128 int32 elements), so the first-int32
check of every block remains valid.

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main:
d68209402d

---------

Signed-off-by: lhchg <lhao_cheng@163.com>
Co-authored-by: lihaocheng <lihaosheng1@h-partners.com>
This commit is contained in:
lhchg
2026-01-23 21:14:18 +08:00
committed by GitHub
parent 44a4ff6960
commit 717d299ae5
2 changed files with 7 additions and 5 deletions

View File

@@ -224,7 +224,7 @@ private:
tokenPerExpert.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert));
tokenPerExpertLayout = Layout3D(params.EP * params.expertPerRank, params.expertPerRank);
tokenPerExpertLayout = Layout3D( AlignUp(params.EP * params.expertPerRank, ALIGN_128), params.expertPerRank);
}
template<typename T>
@@ -291,7 +291,7 @@ private:
AscendC::DataCopyPad(
tmpBuffer1,
tokenPerExpert[rankId * expertPerRank],
{U16(EP), U16(expertPerRank * sizeof(int32_t)), U16(((EP - 1) * expertPerRank) * sizeof(int32_t)), 0},
{U16(EP), U16(expertPerRank * sizeof(int32_t)), U16((AlignUp(EP * expertPerRank, ALIGN_128) - expertPerRank) * sizeof(int32_t)), 0},
{}
);
@@ -547,7 +547,7 @@ private:
CATLASS_DEVICE
void CrossRankSyncAndlocalTokenPerExpertAllGather(Params const &params, int64_t localTokenPerExpertOffset){
AscendC::LocalTensor<int32_t> tmpBuffer = resource.ubBuf.template GetBufferByByte<int32_t>(0);
uint32_t numPerCore = params.EP * params.expertPerRank;
uint32_t numPerCore = AlignUp(params.EP * params.expertPerRank, ALIGN_128);
for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
if (dstEpIdx == params.rank) {
continue;
@@ -582,12 +582,13 @@ private:
AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
}
for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
if (dstEpIdx == params.rank) {
continue;
}
int32_t intPer512 = CACHE_LINE / sizeof(int);
for(int32_t checkIdx = 0; checkIdx < params.EP * params.expertPerRank; checkIdx += intPer512) {
for(int32_t checkIdx = 0; checkIdx < AlignUp(params.EP * params.expertPerRank, ALIGN_128); checkIdx += intPer512) {
__gm__ int32_t* sync_check = reinterpret_cast<__gm__ int32_t*>(shmem() + peermemInfo.offsetPeerTokenPerExpert) + tokenPerExpertLayout(dstEpIdx, 0, checkIdx);
gm_signal_wait_until_ne(sync_check, 0);
}
@@ -776,7 +777,7 @@ private:
}
blockEpilogue.Finalize();
AscendC::SyncAll<true>();
ResetTokenPerExpert(tokenPerExpert, params.EP * params.EP * params.expertPerRank);
ResetTokenPerExpert(tokenPerExpert, params.EP * AlignUp(params.EP * params.expertPerRank, ALIGN_128));
shmem.CrossRankSync();
MoeTokenUnpermuteTilingData tilingData;
MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum);

View File

@@ -5,4 +5,5 @@ constexpr static uint64_t MB_SIZE = 1024 * 1024UL;
constexpr static int32_t NUMS_PER_FLAG = 16;
constexpr static int32_t CACHE_LINE = 512;
constexpr static int32_t RESET_VAL = 0xffff;
constexpr static int32_t ALIGN_128 = 128;
#endif