[BugFix][0.18.0] dispatch_ffn_combine kernel: roll back the combine, unpermute, and scale parts (#8534)

cherry-pick https://github.com/vllm-project/vllm-ascend/pull/8539

<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->
### What this PR does / why we need it?
<!--
- Please clarify what changes you are proposing. The purpose of this
section is to outline the changes and how this PR fixes the issue.
If possible, please consider writing useful notes for better and faster
reviews in your PR.

- Please clarify why the changes are needed. For instance, the use case
and bug description.

- Fixes #
-->
Based on end-to-end testing results, three optimizations for the decode
scenario have been reverted in the dispatch_ffn_combine kernel.
### Does this PR introduce _any_ user-facing change?
<!--
Note that it means *any* user-facing change including all aspects such
as API, interface or other behavior changes.
Documentation-only updates are not considered user-facing changes.
-->

### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->

---------

Signed-off-by: l00893928 <liuquanlu@huawei.com>
Co-authored-by: l00893928 <liuquanlu@huawei.com>
This commit is contained in:
LQLlulu
2026-04-22 23:27:02 +08:00
committed by GitHub
parent 69a57bc9ec
commit fcf4d477a7
6 changed files with 180 additions and 136 deletions

View File

@@ -234,7 +234,8 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
using BlockEpilogue1 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy1, CType, PerTokenScaleType, using BlockEpilogue1 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy1, CType, PerTokenScaleType,
D1Type, TileElemWiseMuls, TileCopy1>; D1Type, TileElemWiseMuls, TileCopy1>;
using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequantV2<ubStages>; using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequant<ubStages>;
using TileCopy2 = Epilogue::Tile::TileCopy<ArchTag, CType, ScaleType, PerTokenScaleType, D2Type>; using TileCopy2 = Epilogue::Tile::TileCopy<ArchTag, CType, ScaleType, PerTokenScaleType, D2Type>;
using BlockEpilogue2 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy2, CType,PerTokenScaleType, using BlockEpilogue2 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy2, CType,PerTokenScaleType,
D2Type, TileCopy2>; D2Type, TileCopy2>;
@@ -254,9 +255,11 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
GemmCoord problemShape{static_cast<uint32_t>(m), static_cast<uint32_t>(n), static_cast<uint32_t>(k)}; GemmCoord problemShape{static_cast<uint32_t>(m), static_cast<uint32_t>(n), static_cast<uint32_t>(k)};
uint32_t epilogueCoreNum = aivNum / 2; uint32_t epilogueCoreNum = aivNum;
uint32_t epilogueGranularity = expertPerRank - 1; uint32_t epilogueGranularity = expertPerRank - 3;
if (expertPerRank <= 4) {
epilogueGranularity = expertPerRank - 1;
}
typename MatmulKernel::Params params{ typename MatmulKernel::Params params{
problemShape, static_cast<uint32_t>(EP), static_cast<uint32_t>(listLen), static_cast<uint32_t>(expertPerRank), static_cast<uint32_t>(maxOutputSize), problemShape, static_cast<uint32_t>(EP), static_cast<uint32_t>(listLen), static_cast<uint32_t>(expertPerRank), static_cast<uint32_t>(maxOutputSize),
static_cast<uint32_t>(rank), static_cast<uint32_t>(rankSize), static_cast<uint32_t>(rank), static_cast<uint32_t>(rankSize),

View File

@@ -571,6 +571,7 @@ private:
if constexpr (BlockMmad::DispatchPolicy::ASYNC) { if constexpr (BlockMmad::DispatchPolicy::ASYNC) {
blockMmad.SynchronizeBlock(); blockMmad.SynchronizeBlock();
} }
blockMmad.Finalize(params.expertPerRank - 1, 0);
} }
@@ -727,19 +728,6 @@ private:
} }
CATLASS_DEVICE
void CombineSetFlag() {
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID0);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID1);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID2);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID3);
AscendC::SetFlag<AscendC::HardEvent::S_MTE2>(EVENT_ID2);
AscendC::SetFlag<AscendC::HardEvent::S_MTE2>(EVENT_ID3);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID0);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID1);
}
CATLASS_DEVICE CATLASS_DEVICE
void DispatchAndCombine(Params const &params) { void DispatchAndCombine(Params const &params) {
icache_preload(8); icache_preload(8);
@@ -800,13 +788,17 @@ private:
GM_ADDR otherRankPtr = shmem(0, dstEpIdx); GM_ADDR otherRankPtr = shmem(0, dstEpIdx);
AscendC::GlobalTensor<ElementA> gmRemoteA; AscendC::GlobalTensor<ElementA> gmRemoteA;
gmRemoteA.SetGlobalBuffer(reinterpret_cast<__gm__ ElementA*>(otherRankPtr + peermemInfo.offsetA)); gmRemoteA.SetGlobalBuffer(reinterpret_cast<__gm__ ElementA*>(otherRankPtr + peermemInfo.offsetA));
AscendC::GlobalTensor<ElementPerTokenScale> gmRemotePerTokenScale;
gmRemotePerTokenScale.SetGlobalBuffer(reinterpret_cast<__gm__ ElementPerTokenScale*>(otherRankPtr + peermemInfo.offsetPeerPerTokenScale));
MatrixCoord offsetA{rowStart, 0}; MatrixCoord offsetA{rowStart, 0};
MatrixCoord offsetPeer{rowSrc, 0}; MatrixCoord offsetPeer{rowSrc, 0};
int64_t gmOffsetA = params.layoutA.GetOffset(offsetA); int64_t gmOffsetA = params.layoutA.GetOffset(offsetA);
int64_t gmOffsetPeer = rowSrc * (params.problemShape.k() + ALIGN_512); int64_t gmOffsetPeer = params.layoutA.GetOffset(offsetPeer);
// Communication data // Communication data
CopyGMToGMPerToken(gmA[gmOffsetA], gmPerTokenScale1[rowStart], gmRemoteA[gmOffsetPeer], rows, params.problemShape.k()); CopyGMToGM(gmA[gmOffsetA], gmRemoteA[gmOffsetPeer], rows * params.problemShape.k(), params.ubMoveNum);
// Communication scale
CopyGMToGM(gmPerTokenScale1[rowStart], gmRemotePerTokenScale[rowSrc], rows, rows);
} }
} }
@@ -837,16 +829,12 @@ private:
uint32_t n2 = params.problemShape.k(); uint32_t n2 = params.problemShape.k();
typename BlockEpilogue2::Params epilogueParams{ typename BlockEpilogue2::Params epilogueParams{
static_cast<int32_t>(params.EP), static_cast<int32_t>(params.EP),
static_cast<int32_t>(params.expertPerRank), static_cast<int32_t>(params.expertPerRank),
static_cast<int32_t>(params.rank),
reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert), reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert),
params.layoutD2, static_cast<int32_t>(n2)
static_cast<int32_t>(n2),
static_cast<int32_t>(L1TileShape::N),
shmem,
static_cast<int32_t>(peermemInfo.offsetD)
}; };
uint32_t n = params.problemShape.n(); uint32_t n = params.problemShape.n();
@@ -890,109 +878,65 @@ private:
blockEpilogue1.Finalize(); blockEpilogue1.Finalize();
blockEpilogue2.SetFlag();
CombineSetFlag(); CombineV1(params, blockEpilogue2);
CombineV2(params, blockEpilogue2);
AscendC::SyncAll<true>(); AscendC::SyncAll<true>();
#ifndef __CROSSRANKSYNCANDALLGATHERV1__ #ifndef __CROSSRANKSYNCANDALLGATHERV1__
ResetTokenPerExpert(params.EP * AlignUp(params.EP * params.expertPerRank, 128)); ResetTokenPerExpert(params.EP * AlignUp(params.EP * params.expertPerRank, 128));
#endif #endif
shmem.InitStatusTargetSum();
if (get_subblockid() == 0) { shmem.CrossRankSync();
AscendC::LocalTensor<int32_t> ctrBuffer = resource.ubBuf.template GetBufferByByte<int32_t>(0);
shmem.CrossRankSyncV2Set(ctrBuffer);
} else {
uint32_t uboffset = 0;
uint32_t aicCoreNum = coreNum / 2;
uint32_t aicCoreIdx = get_block_idx();
uint32_t sendRankNum_ = params.EP / aicCoreNum;
uint32_t remainderRankNum = params.EP % aicCoreNum;
if (aicCoreIdx < remainderRankNum) {
sendRankNum_++;
}
AscendC::LocalTensor<float> statusTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
uboffset += sendRankNum_ * UB_ALIGN;
AscendC::LocalTensor<float> gatherMaskOutTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
uboffset += AlignUp(params.EP * sizeof(float), 32);
AscendC::LocalTensor<uint32_t> gatherTmpTensor = resource.ubBuf.template GetBufferByByte<uint32_t>(uboffset);
uboffset += AlignUp(sizeof(uint32_t), 32);
AscendC::LocalTensor<float> statusSumOutTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
uboffset += AlignUp(sizeof(float), 32);
shmem.CrossRankSyncV2Wait(statusTensor, gatherMaskOutTensor, gatherTmpTensor, statusSumOutTensor);
MoeTokenUnpermuteTilingData tilingData; MoeTokenUnpermuteTilingData tilingData;
MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum / 2); MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum);
KernelMoeTokenUnpermute<ElementD2, int32_t, float, true> kernelMoeTokenUnpermuteOp; KernelMoeTokenUnpermute<ElementD2, int32_t, float, true> kernelMoeTokenUnpermuteOp;
kernelMoeTokenUnpermuteOp.Init(shmem() + peermemInfo.offsetD, workspaceInfo.expandedRowIdx, params.probs, reinterpret_cast<GM_ADDR>(params.ptrOutput), &tilingData); kernelMoeTokenUnpermuteOp.Init(shmem() + peermemInfo.offsetD, workspaceInfo.expandedRowIdx, params.probs, reinterpret_cast<GM_ADDR>(params.ptrOutput), &tilingData);
kernelMoeTokenUnpermuteOp.Process(); kernelMoeTokenUnpermuteOp.Process();
} }
}
CATLASS_DEVICE CATLASS_DEVICE
void CombineV2(Params const &params, BlockEpilogue2 & blockEpilogue) { void CombineV1(Params const &params, BlockEpilogue2 & blockEpilogue) {
BlockScheduler blockScheduler;
int32_t syncLoopIdx = 0;
uint32_t startCoreIdx = 0;
uint32_t aicCoreNum = coreNum / 2;
uint32_t aicCoreIdx = get_block_idx();
uint32_t aivSubCoreIdx = get_subblockid();
uint32_t preSrcExpertSum = 0;
uint32_t n2 = params.problemShape.k(); uint32_t n2 = params.problemShape.k();
uint32_t k2 = params.problemShape.n() / 2; int32_t prevGroupSum2 = 0;
icache_preload(8); icache_preload(8);
for (uint32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) { for (uint32_t t_groupIdx = 0; t_groupIdx < params.expertPerRank; ++t_groupIdx) {
uint32_t currentExpertM = cumsumMM((params.EP - 1) * params.expertPerRank + groupIdx); int32_t flagId = t_groupIdx / CROSS_CORE_FLAG_MAX_SET_COUNT;
if (preSrcExpertSum >= params.maxOutputSize) { AscendC::CrossCoreWaitFlag<0x2>(flagId);
currentExpertM = 0; AscendC::SyncAll<true>();
} else if (preSrcExpertSum + currentExpertM > params.maxOutputSize) {
currentExpertM = params.maxOutputSize - preSrcExpertSum;
}
GemmCoord inGroupProblemShape{currentExpertM, n2, k2}; // M N K
blockScheduler.Update(inGroupProblemShape, MakeCoord(L1TileShape::M, L1TileShape::N));
uint32_t coreLoops = blockScheduler.GetCoreLoops();
uint32_t startLoopIdx = ((aicCoreIdx < startCoreIdx) ? (aicCoreIdx + aicCoreNum) : aicCoreIdx) - startCoreIdx;
for (uint32_t loopIdx = startLoopIdx; loopIdx < coreLoops; loopIdx += aicCoreNum) { uint32_t groupIdx = t_groupIdx;
GemmCoord blockCoord = blockScheduler.GetBlockCoord(loopIdx);
GemmCoord actualBlockShape = blockScheduler.GetActualBlockShape(blockCoord);
int32_t m0 = 16;
// Block count, the shape of each block is (m0, actualBlockShape.n())
int32_t m_rows = (actualBlockShape.m() + m0 - 1) / m0;
int32_t aiv_m_rows = m_rows / 2;
if (aivSubCoreIdx == 1 && aiv_m_rows * 2 < m_rows) {
aiv_m_rows += 1;
}
uint32_t m_offset = blockCoord.m() * L1TileShape::M;//blockOffset
if(aivSubCoreIdx == 1) {
m_offset += (m_rows / 2) * m0;
}
for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
for (;syncLoopIdx <= groupIdx; syncLoopIdx ++) { __gm__ void* dstPeermemPtr = shmem(peermemInfo.offsetD, dstEpIdx);
int32_t flag_id = syncLoopIdx / CROSS_CORE_FLAG_MAX_SET_COUNT; AscendC::GlobalTensor<ElementD2> gmRemotePeer;
AscendC::CrossCoreWaitFlag<0x2>(flag_id); gmRemotePeer.SetGlobalBuffer(reinterpret_cast<__gm__ ElementD2*>(dstPeermemPtr));
uint32_t srcRowOffset = (dstEpIdx == 0 ? 0 : cumsumMM((dstEpIdx - 1) * params.expertPerRank + groupIdx)) + prevGroupSum2;
if (srcRowOffset < params.maxOutputSize) {
uint32_t dataRows = tokenPerExpert(tokenPerExpertLayout(dstEpIdx, params.rank, groupIdx));
if (srcRowOffset + dataRows > params.maxOutputSize) {
dataRows = params.maxOutputSize - srcRowOffset;
} }
//uint32_t dstRowOffset = preSumBeforeRank(2 * dstEpIdx * FLAGSTRIDE + groupIdx);
for (int32_t cur_row = 0; cur_row < aiv_m_rows; cur_row ++) { int32_t tmpBlock = AlignUp(params.expertPerRank, FLAGSTRIDE);
GemmCoord realTileCoord{m_offset, blockCoord.n() * L1TileShape::N, 1}; //uint32_t dstRowOffset = preSumBeforeRank(dstEpIdx * tmpBlock + groupIdx);
uint32_t actualm = m0; uint32_t dstRowOffset = preSumBeforeRank(dstEpIdx * params.expertPerRank + groupIdx);
if(aivSubCoreIdx == 1 && cur_row == aiv_m_rows - 1){ MatrixCoord offsetC{srcRowOffset, 0};
actualm = actualBlockShape.m() - (m_rows / 2) * m0 - cur_row * m0; MatrixCoord offsetPeer{dstRowOffset, 0};
} MatrixCoord shapeC{dataRows, n2};
GemmCoord realTileShape{actualm, actualBlockShape.n(), 1}; int64_t gmOffsetC = params.layoutD2.GetOffset(offsetC);
blockEpilogue(gmC2, gmPerTokenScale2, realTileCoord, realTileShape, groupIdx, preSrcExpertSum, preSumBeforeRank); int64_t gmOffsetPeer = params.layoutD2.GetOffset(offsetPeer);
m_offset += m0; if constexpr (std::is_same_v<ElementA, int8_t>) {
blockEpilogue(gmC2[gmOffsetC], shapeC, gmPerTokenScale2[srcRowOffset], gmRemotePeer[gmOffsetPeer]);
} else {
blockEpilogue(gmC2[gmOffsetC], shapeC, gmRemotePeer[gmOffsetPeer]);
} }
} }
preSrcExpertSum += currentExpertM; }
startCoreIdx = (startCoreIdx + coreLoops) % aicCoreNum; prevGroupSum2 += cumsumMM((params.EP - 1) * params.expertPerRank + groupIdx);
} }
blockEpilogue.Finalize(); blockEpilogue.Finalize();
} }
private: private:
struct WorkspaceInfo { struct WorkspaceInfo {
GM_ADDR ptrA; GM_ADDR ptrA;

View File

@@ -35,6 +35,7 @@ class MoeV2FullLoadDynamicQuant : public MoeV2SortBase {
__aicore__ inline void CopyOutIdx(); __aicore__ inline void CopyOutIdx();
__aicore__ inline void CopyOutEmpty(); __aicore__ inline void CopyOutEmpty();
__aicore__ inline void CopyOutXQuant1H(); __aicore__ inline void CopyOutXQuant1H();
__aicore__ inline void CopyOutXQuantEH();
__aicore__ inline void ComputeExpertTokenCountOrCumsum(); __aicore__ inline void ComputeExpertTokenCountOrCumsum();
__aicore__ inline void Compute(LocalTensor<float>& smoothLocal); __aicore__ inline void Compute(LocalTensor<float>& smoothLocal);
@@ -48,7 +49,6 @@ class MoeV2FullLoadDynamicQuant : public MoeV2SortBase {
int64_t k_; int64_t k_;
int64_t n_; int64_t n_;
int64_t cols_; int64_t cols_;
int64_t cols_scale_;
int64_t activateRows_; int64_t activateRows_;
int64_t expertNum; int64_t expertNum;
int64_t expertCapacity; int64_t expertCapacity;
@@ -63,10 +63,12 @@ class MoeV2FullLoadDynamicQuant : public MoeV2SortBase {
TQue<QuePosition::VECIN, 1> smoothInQueue; TQue<QuePosition::VECIN, 1> smoothInQueue;
TQue<QuePosition::VECOUT, 1> calcQueue; TQue<QuePosition::VECOUT, 1> calcQueue;
TQue<QuePosition::VECOUT, 1> inputXOutQueue; TQue<QuePosition::VECOUT, 1> inputXOutQueue;
TQue<QuePosition::VECOUT, 1> scaleOutQueue;
GlobalTensor<T> xGm_; GlobalTensor<T> xGm_;
GlobalTensor<int32_t> expertIdxGm_; GlobalTensor<int32_t> expertIdxGm_;
GlobalTensor<float> quantSmoothGm; GlobalTensor<float> quantSmoothGm;
GlobalTensor<float> dynamicQuantScaleGm;
GlobalTensor<int8_t> expandedXGm_; GlobalTensor<int8_t> expandedXGm_;
GlobalTensor<int32_t> expandedRowIdxGm_; GlobalTensor<int32_t> expandedRowIdxGm_;
@@ -223,7 +225,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&
LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>(); LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>(); LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();
LocalTensor<float> dynamicQuantLocal = outLocal[this->cols_].template ReinterpretCast<float>(); LocalTensor<float> dynamicQuantLocal = scaleOutQueue.AllocTensor<float>();
if constexpr (!IsSameType<T, float>::value) { if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_); Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_);
@@ -257,6 +259,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&
calcQueue.FreeTensor(tempLocal); calcQueue.FreeTensor(tempLocal);
inputXOutQueue.EnQue(outLocal); inputXOutQueue.EnQue(outLocal);
scaleOutQueue.EnQue(dynamicQuantLocal);
} }
template <typename T> template <typename T>
@@ -272,7 +275,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuant1H() {
DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0}; DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0}; DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
DataCopyExtParams intriParams{1, static_cast<uint32_t>((this->cols_ + BLOCK_BYTES) * sizeof(int8_t)), 0, 0, 0}; DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
LocalTensor<float> smoothLocal; LocalTensor<float> smoothLocal;
if (smoothType == 1) { if (smoothType == 1) {
@@ -292,6 +295,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuant1H() {
xCopyInQueue_.EnQue<T>(xLocal); xCopyInQueue_.EnQue<T>(xLocal);
Compute(smoothLocal); Compute(smoothLocal);
LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>(); LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>();
while (curRowsStart <= curRowsEnd && curRowsStart / this->k_ == row) { while (curRowsStart <= curRowsEnd && curRowsStart / this->k_ == row) {
int32_t outIndex = expandedRowIdx.GetValue(curRowsStart); int32_t outIndex = expandedRowIdx.GetValue(curRowsStart);
@@ -299,15 +303,74 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuant1H() {
if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows_)) { if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows_)) {
continue; continue;
} }
DataCopyPad(expandedXGm_[outIndex * this->cols_scale_], outLocal, intriParams); DataCopyPad(expandedXGm_[outIndex * cols_], outLocal, intriParams);
DataCopyPad(dynamicQuantScaleGm[outIndex], quantScaleLocal, {1, 4, 0, 0, 0});
} }
xCopyInQueue_.FreeTensor(xLocal); xCopyInQueue_.FreeTensor(xLocal);
inputXOutQueue.FreeTensor(outLocal); inputXOutQueue.FreeTensor(outLocal);
scaleOutQueue.FreeTensor(quantScaleLocal);
}
if (smoothType == 1) {
smoothInQueue.FreeTensor(smoothLocal);
} }
expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx); expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
} }
template <typename T>
__aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuantEH() {
LocalTensor<int32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.DeQue<int32_t>();
expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
Muls(expandDstToSrcRowLocal.ReinterpretCast<float>(), expandDstToSrcRowLocal.ReinterpretCast<float>(), (float)-1,
this->totalLength);
pipe_barrier(PIPE_V);
LocalTensor<int32_t> sortedRowIdx = expandDstToSrcRowLocal.ReinterpretCast<int32_t>();
Cast(sortedRowIdx, expandDstToSrcRowLocal.ReinterpretCast<float>(), RoundMode::CAST_ROUND, this->totalLength);
int64_t curRowsStart = this->blockIdx_ * this->perCoreRows_;
int64_t curRowsEnd = curRowsStart + this->coreRows_ - 1;
DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
for (int64_t row = curRowsStart; row <= curRowsEnd; row++) {
if (this->dropPadMode == DROPLESS_MODE && row >= this->activateRows_) {
break;
}
int32_t srcIdx = sortedRowIdx.GetValue(row);
int32_t expertIdx = expandedExpertIdxLocal.GetValue(row);
LocalTensor<T> inLocal = xCopyInQueue_.AllocTensor<T>();
LocalTensor<float> smoothLocal = smoothInQueue.AllocTensor<float>();
if constexpr (IsSameType<T, float>::value) {
DataCopyPad(inLocal, xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
} else {
DataCopyPad(inLocal[colsAlign], xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
}
DataCopyPad(smoothLocal, quantSmoothGm[expertIdx * this->cols_], smoothCopyParams, {false, 0, 0, 0});
xCopyInQueue_.EnQue<T>(inLocal);
smoothInQueue.EnQue(smoothLocal);
smoothLocal = smoothInQueue.DeQue<float>();
Compute(smoothLocal);
LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
DataCopyPad(dynamicQuantScaleGm[row], quantScaleLocal, {1, 4, 0, 0, 0});
LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>();
DataCopyPad(expandedXGm_[row * this->cols_], outLocal, intriParams);
xCopyInQueue_.FreeTensor(inLocal);
smoothInQueue.FreeTensor(smoothLocal);
inputXOutQueue.FreeTensor(outLocal);
scaleOutQueue.FreeTensor(quantScaleLocal);
}
expandDstToSrcRowQueue_.FreeTensor(expandDstToSrcRowLocal);
expandedExpertIdxCopyOutQueue_.FreeTensor(expandedExpertIdxLocal);
}
template <typename T> template <typename T>
__aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR expandedX, __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR expandedX,
GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum,
@@ -321,7 +384,6 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR exp
this->k_ = tilingData->k; this->k_ = tilingData->k;
this->n_ = tilingData->n; this->n_ = tilingData->n;
this->cols_ = tilingData->cols; this->cols_ = tilingData->cols;
this->cols_scale_ = this->cols_ + ALIGN_512;
this->needCoreNum_ = this->gatherOutTilingData_->needCoreNum; this->needCoreNum_ = this->gatherOutTilingData_->needCoreNum;
this->perCoreRows_ = this->gatherOutTilingData_->perCoreRows; this->perCoreRows_ = this->gatherOutTilingData_->perCoreRows;
this->activateRows_ = this->gatherOutTilingData_->activateRows; this->activateRows_ = this->gatherOutTilingData_->activateRows;
@@ -352,6 +414,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR exp
Align(this->expertNum, sizeof(int32_t))); Align(this->expertNum, sizeof(int32_t)));
} }
quantSmoothGm.SetGlobalBuffer((__gm__ float*)quantSmooth); quantSmoothGm.SetGlobalBuffer((__gm__ float*)quantSmooth);
dynamicQuantScaleGm.SetGlobalBuffer((__gm__ float*)dynamicQuantScale);
int64_t kvFactor = 2; int64_t kvFactor = 2;
int64_t buffSize = this->sortNum_ * sizeof(int32_t); int64_t buffSize = this->sortNum_ * sizeof(int32_t);
@@ -375,7 +438,8 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR exp
} }
pipe->InitBuffer(smoothInQueue, 1, AlignBytes(this->cols_, sizeof(float))); pipe->InitBuffer(smoothInQueue, 1, AlignBytes(this->cols_, sizeof(float)));
pipe->InitBuffer(calcQueue, 1, AlignBytes(this->cols_, sizeof(float))); pipe->InitBuffer(calcQueue, 1, AlignBytes(this->cols_, sizeof(float)));
pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->cols_scale_, sizeof(int8_t))); pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->cols_, sizeof(int8_t)));
pipe->InitBuffer(scaleOutQueue, 1, BLOCK_BYTES + BLOCK_BYTES);
} }
template <typename T> template <typename T>
@@ -391,8 +455,12 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Process() {
} else { } else {
CopyOutEmpty(); CopyOutEmpty();
} }
if (smoothType == 2) {
CopyOutXQuantEH();
} else {
CopyOutXQuant1H(); CopyOutXQuant1H();
} }
}
} }
} // namespace MoeInitRoutingQuantV2 } // namespace MoeInitRoutingQuantV2
#endif // MOE_V2_DYNAMIC_QUANT_FULL_LOAD_H #endif // MOE_V2_DYNAMIC_QUANT_FULL_LOAD_H

View File

@@ -66,7 +66,6 @@ class MoeV2GatherDynamicQuant {
int64_t needCoreNum; int64_t needCoreNum;
int64_t blockIdx; int64_t blockIdx;
int64_t cols; int64_t cols;
int64_t cols_scale_;
int64_t n; int64_t n;
int64_t k; int64_t k;
int64_t totalLength; int64_t totalLength;
@@ -118,7 +117,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s
LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>(); LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>(); LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();
LocalTensor<float> dynamicQuantLocal = outLocal[this->cols].template ReinterpretCast<float>(); LocalTensor<float> dynamicQuantLocal = scaleOutQueue.AllocTensor<float>();
if constexpr (!IsSameType<T, float>::value) { if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols); Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
@@ -152,6 +151,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s
calcQueue.FreeTensor(tempLocal); calcQueue.FreeTensor(tempLocal);
inputXOutQueue.EnQue(outLocal); inputXOutQueue.EnQue(outLocal);
scaleOutQueue.EnQue(dynamicQuantLocal);
} }
template <typename T> template <typename T>
@@ -163,7 +163,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr
int64_t currentLoopStartRow = initialRow / this->k; int64_t currentLoopStartRow = initialRow / this->k;
int64_t currentLoopLastRow = (initialRow + this->currentLoopRows - 1) / this->k; int64_t currentLoopLastRow = (initialRow + this->currentLoopRows - 1) / this->k;
DataCopyExtParams copyInParams{1, static_cast<uint32_t>(this->cols * sizeof(T)), 0, 0, 0}; DataCopyExtParams copyInParams{1, static_cast<uint32_t>(this->cols * sizeof(T)), 0, 0, 0};
DataCopyExtParams copyOutParams{1, static_cast<uint32_t>((this->cols + BLOCK_BYTES) * sizeof(int8_t)), 0, 0, 0}; DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(this->cols * sizeof(int8_t)), 0, 0, 0};
DataCopyExtParams smoothParams{1, static_cast<uint32_t>(this->cols * sizeof(float)), 0, 0, 0}; DataCopyExtParams smoothParams{1, static_cast<uint32_t>(this->cols * sizeof(float)), 0, 0, 0};
LocalTensor<float> smoothLocal; LocalTensor<float> smoothLocal;
@@ -187,6 +187,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr
// Compute quantization // Compute quantization
Compute(smoothLocal); Compute(smoothLocal);
LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>(); LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>();
while (curLoopRow < this->currentLoopRows && initialRow / this->k == row) { while (curLoopRow < this->currentLoopRows && initialRow / this->k == row) {
@@ -196,11 +197,15 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr
if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows)) { if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows)) {
continue; continue;
} }
// Scale is placed after the data position DataCopyPad(expandedXGm[outIndex * cols], outLocal, copyOutParams);
DataCopyPad(expandedXGm[outIndex * cols_scale_], outLocal, copyOutParams); DataCopyPad(dynamicQuantScaleGm[outIndex], quantScaleLocal, {1, 4, 0, 0, 0});
} }
inputXInQueue.FreeTensor(inLocal); inputXInQueue.FreeTensor(inLocal);
inputXOutQueue.FreeTensor(outLocal); inputXOutQueue.FreeTensor(outLocal);
scaleOutQueue.FreeTensor(quantScaleLocal);
}
if (smoothType == 1) {
smoothInQueue.FreeTensor(smoothLocal);
} }
expandRowIdxInQueue.FreeTensor(indicesLocal); expandRowIdxInQueue.FreeTensor(indicesLocal);
} }
@@ -458,7 +463,6 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Init(GM_ADDR inputX, GM_ADDR
this->needCoreNum = this->gatherOutTilingData->needCoreNum; this->needCoreNum = this->gatherOutTilingData->needCoreNum;
this->activateRows = this->gatherOutTilingData->activateRows; this->activateRows = this->gatherOutTilingData->activateRows;
this->cols = tilingData->cols; this->cols = tilingData->cols;
this->cols_scale_ = this->cols + ALIGN_512;
this->n = tilingData->n; this->n = tilingData->n;
this->k = tilingData->k; this->k = tilingData->k;
this->totalLength = tilingData->n * tilingData->k; this->totalLength = tilingData->n * tilingData->k;
@@ -514,15 +518,33 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Init(GM_ADDR inputX, GM_ADDR
pipe->InitBuffer(smoothInQueue, BUFFER_NUM, AlignBytes(this->perLoopCols, sizeof(float))); pipe->InitBuffer(smoothInQueue, BUFFER_NUM, AlignBytes(this->perLoopCols, sizeof(float)));
pipe->InitBuffer(calcQueue, 1, AlignBytes(this->perLoopCols, sizeof(float))); pipe->InitBuffer(calcQueue, 1, AlignBytes(this->perLoopCols, sizeof(float)));
pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->perLoopCols, sizeof(int8_t))); pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->perLoopCols, sizeof(int8_t)));
pipe->InitBuffer(scaleOutQueue, 1, BLOCK_BYTES + BLOCK_BYTES);
} }
template <typename T> template <typename T>
__aicore__ inline void MoeV2GatherDynamicQuant<T>::Process() { __aicore__ inline void MoeV2GatherDynamicQuant<T>::Process() {
if (this->blockIdx < this->needCoreNum) { if (this->blockIdx < this->needCoreNum) {
currentLoopRows = perLoopRows; currentLoopRows = perLoopRows;
if (colLoops > 1) { // Cannot fit all data in one row, workspace is required
trap(); // Not supported if (colLoops > 1) { // A single row cannot be fully loaded; workspace is required
} else { // All data can fit in one row if (smoothType == 2) {
for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
CopyInExpandedExpertIdx(loop);
CopyOutPartialXQuantEH(loop);
}
currentLoopRows = lastLoopRows;
CopyInExpandedExpertIdx(this->rowLoops - 1);
CopyOutPartialXQuantEH(this->rowLoops - 1);
} else {
for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
CopyInExpandedRowIdx(loop);
CopyOutPartialXQuant1H(loop);
}
currentLoopRows = lastLoopRows;
CopyInExpandedRowIdx(this->rowLoops - 1);
CopyOutPartialXQuant1H(this->rowLoops - 1);
}
} else { // A single row can be fully loaded
if (smoothType == 2) { if (smoothType == 2) {
for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) { for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
CopyInExpandedExpertIdx(loop); CopyInExpandedExpertIdx(loop);

View File

@@ -85,9 +85,8 @@ KernelMoeTokenUnpermute<T1, T2, T3, PROBS>::Init(GM_ADDR permuted_tokens, GM_ADD
GM_ADDR unpermuted_tokens, GM_ADDR unpermuted_tokens,
const MoeTokenUnpermuteTilingData *__restrict tiling_data) const MoeTokenUnpermuteTilingData *__restrict tiling_data)
{ {
this->blockIdx = get_block_idx(); this->blockIdx = get_block_idx() + get_subblockid() * get_block_num();
this->blockNum = get_block_num(); this->blockNum = get_block_num() * get_subblockdim();
if (blockIdx >= blockNum) { if (blockIdx >= blockNum) {
return; return;
} }

View File

@@ -99,12 +99,20 @@ public:
eventUbDMTE3VList[i] = eventMTE3V++; eventUbDMTE3VList[i] = eventMTE3V++;
eventUbDVMTE3List[i] = eventVMTE3++; eventUbDVMTE3List[i] = eventVMTE3++;
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[i]);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[i]);
ubCFp32List[i] = resource.ubBuf.template GetBufferByByte<float>(ubOffset); ubCFp32List[i] = resource.ubBuf.template GetBufferByByte<float>(ubOffset);
ubOffset += blockN * sizeof(float); ubOffset += blockN * sizeof(float);
} }
} }
CATLASS_DEVICE
void SetFlag()
{
for (uint32_t i = 0; i < UB_STAGES; ++i) {
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[i]);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[i]);
}
}
CATLASS_DEVICE CATLASS_DEVICE
void Finalize() void Finalize()
{ {