[fix]: fix precision issue in dispatch_ffn_combine_bf16 and remove redundant sync (#7198)

### What this PR does / why we need it?
Fix the precision issue in the dispatch_ffn_combine_bf16 operator.
Remove redundant synchronization operations in the dispatch_ffn_combine
operator.

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: guanguan0308 <1546542263@qq.com>
This commit is contained in:
guanguan0308
2026-03-23 10:14:03 +08:00
committed by GitHub
parent e68464a1d6
commit 44ef9a36ac
8 changed files with 531 additions and 462 deletions

View File

@@ -391,7 +391,6 @@ private:
uint16_t syncgmmIdx = 0;
AscendC::CrossCoreWaitFlag<0x2>(syncgmmIdx / CROSS_CORE_FLAG_MAX_SET_COUNT); // Wait for AIV to finish cumsum for matmul
syncgmmIdx++;
AscendC::PipeBarrier<PIPE_ALL>();
for (uint32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) {
uint32_t currentM = cumsumMM((params.EP - 1) * params.expertPerRank + groupIdx);
@@ -405,7 +404,6 @@ private:
int32_t arrayGroupIdx = params.listLen == 1 ? 0 : groupIdx;
gmB1.SetGlobalBuffer(reinterpret_cast<__gm__ ElementB *>(GetTensorAddr<int8_t>(arrayGroupIdx, params.ptrB1)));
gmS.SetGlobalBuffer(reinterpret_cast<__gm__ ElementScale *>(GetTensorAddr<int64_t>(arrayGroupIdx, params.ptrScale1)));
AscendC::PipeBarrier<PIPE_ALL>();
if (currentM <= L1TileShape::M) {
gmB1.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE);
}
@@ -493,8 +491,6 @@ private:
uint32_t startCoreIdx = 0;
AscendC::PipeBarrier<PIPE_ALL>();
int64_t preCurrentmSum = 0;
int32_t syncLoopIdx = -1;
uint32_t lastDequantExpertNum = params.expertPerRank;
@@ -503,8 +499,6 @@ private:
lastDequantExpertNum = params.expertPerRank - params.epilogueGranularity;
}
AscendC::PipeBarrier<PIPE_ALL>();
for (uint32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) {
uint32_t currentM = cumsumMM((params.EP - 1) * params.expertPerRank + groupIdx);
if (preCurrentmSum >= params.maxOutputSize) {