[BugFix][0.18.0] dispatch_ffn_combine kernel: roll back the combine, unpermute, and scale parts (#8534)
cherry-pick https://github.com/vllm-project/vllm-ascend/pull/8539

### What this PR does / why we need it?
Based on end-to-end test results, three optimizations for the decode scenario have been reverted in the dispatch_ffn_combine kernel: the combine part, the unpermute part, and the scale part.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

---------

Signed-off-by: l00893928 <liuquanlu@huawei.com>
Co-authored-by: l00893928 <liuquanlu@huawei.com>
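For readers of the diff below, the "scale part" of the rollback is the layout change visible in the MoeV2 quant kernels: the reverted optimization packed each row's dynamic-quant scale directly behind its int8 data inside a `cols_ + ALIGN_512` stride (`cols_scale_`), while the restored code writes scales to the separate `dynamicQuantScaleGm` buffer through a dedicated `scaleOutQueue`. A minimal host-side sketch of the two layouts follows; the function names (`writeRowPacked`, `writeRowSplit`) are hypothetical illustrations, not the kernel's actual AscendC code:

```cpp
// Hedged sketch only -- illustrates the two scale layouts seen in the diff with
// hypothetical host-side buffers; names are assumptions, not kernel identifiers.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

constexpr int64_t kAlign512 = 512;  // mirrors ALIGN_512 in the kernels below

// Reverted (optimized) layout: the float scale rides right after each int8 row
// inside a (cols + 512)-element stride -- the diff's cols_scale_.
void writeRowPacked(std::vector<int8_t>& expandedX, int64_t row, int64_t cols,
                    const int8_t* quantized, float scale) {
    int8_t* dst = expandedX.data() + row * (cols + kAlign512);
    std::copy(quantized, quantized + cols, dst);
    std::memcpy(dst + cols, &scale, sizeof(scale));  // scale packed behind data
}

// Restored layout: data and scales live in separate buffers keyed by the same
// row index, as expandedXGm_ / dynamicQuantScaleGm do again after this revert.
void writeRowSplit(std::vector<int8_t>& expandedX, std::vector<float>& scales,
                   int64_t row, int64_t cols, const int8_t* quantized, float scale) {
    std::copy(quantized, quantized + cols, expandedX.data() + row * cols);
    scales[row] = scale;  // one float32 scale per quantized row
}
```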
@@ -224,7 +224,7 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
constexpr uint32_t ubStages = 2;

using EpilogueDispatchPolicy1 = Epilogue::EpilogueAtlasA2PerTokenDequantSwigluQuant<ubStages>;

using ScaleType = Gemm::GemmType<uint64_t, layout::VectorLayout>;
using PerTokenScaleType = Gemm::GemmType<float, layout::VectorLayout>;
using ElementMulType = Gemm::GemmType<float, layout::RowMajor>;
@@ -234,7 +234,8 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
using BlockEpilogue1 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy1, CType, PerTokenScaleType,
D1Type, TileElemWiseMuls, TileCopy1>;

using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequantV2<ubStages>;
using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequant<ubStages>;

using TileCopy2 = Epilogue::Tile::TileCopy<ArchTag, CType, ScaleType, PerTokenScaleType, D2Type>;
using BlockEpilogue2 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy2, CType, PerTokenScaleType,
D2Type, TileCopy2>;
@@ -254,9 +255,11 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()

GemmCoord problemShape{static_cast<uint32_t>(m), static_cast<uint32_t>(n), static_cast<uint32_t>(k)};

uint32_t epilogueCoreNum = aivNum / 2;
uint32_t epilogueGranularity = expertPerRank - 1;

uint32_t epilogueCoreNum = aivNum;
uint32_t epilogueGranularity = expertPerRank - 3;
if (expertPerRank <= 4) {
epilogueGranularity = expertPerRank - 1;
}
typename MatmulKernel::Params params{
problemShape, static_cast<uint32_t>(EP), static_cast<uint32_t>(listLen), static_cast<uint32_t>(expertPerRank), static_cast<uint32_t>(maxOutputSize),
static_cast<uint32_t>(rank), static_cast<uint32_t>(rankSize),
@@ -277,4 +280,4 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
}

} // DispatchFFNCombineImpl
#endif // DISPATCH_FFN_COMBINE_H
#endif // DISPATCH_FFN_COMBINE_H
@@ -571,6 +571,7 @@ private:
if constexpr (BlockMmad::DispatchPolicy::ASYNC) {
blockMmad.SynchronizeBlock();
}
blockMmad.Finalize(params.expertPerRank - 1, 0);
}

@@ -727,19 +728,6 @@ private:
}

CATLASS_DEVICE
void CombineSetFlag() {
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID0);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID1);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID2);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID3);
AscendC::SetFlag<AscendC::HardEvent::S_MTE2>(EVENT_ID2);
AscendC::SetFlag<AscendC::HardEvent::S_MTE2>(EVENT_ID3);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID0);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID1);
}

CATLASS_DEVICE
void DispatchAndCombine(Params const &params) {
icache_preload(8);
@@ -800,13 +788,17 @@ private:
GM_ADDR otherRankPtr = shmem(0, dstEpIdx);
AscendC::GlobalTensor<ElementA> gmRemoteA;
gmRemoteA.SetGlobalBuffer(reinterpret_cast<__gm__ ElementA*>(otherRankPtr + peermemInfo.offsetA));

AscendC::GlobalTensor<ElementPerTokenScale> gmRemotePerTokenScale;
gmRemotePerTokenScale.SetGlobalBuffer(reinterpret_cast<__gm__ ElementPerTokenScale*>(otherRankPtr + peermemInfo.offsetPeerPerTokenScale));
MatrixCoord offsetA{rowStart, 0};
MatrixCoord offsetPeer{rowSrc, 0};
int64_t gmOffsetA = params.layoutA.GetOffset(offsetA);
int64_t gmOffsetPeer = rowSrc * (params.problemShape.k() + ALIGN_512);
int64_t gmOffsetPeer = params.layoutA.GetOffset(offsetPeer);

// Communication data
CopyGMToGMPerToken(gmA[gmOffsetA], gmPerTokenScale1[rowStart], gmRemoteA[gmOffsetPeer], rows, params.problemShape.k());
CopyGMToGM(gmA[gmOffsetA], gmRemoteA[gmOffsetPeer], rows * params.problemShape.k(), params.ubMoveNum);
// Communication scale
CopyGMToGM(gmPerTokenScale1[rowStart], gmRemotePerTokenScale[rowSrc], rows, rows);
}

}
@@ -837,16 +829,12 @@ private:

uint32_t n2 = params.problemShape.k();

typename BlockEpilogue2::Params epilogueParams{
static_cast<int32_t>(params.EP),
static_cast<int32_t>(params.expertPerRank),
static_cast<int32_t>(params.rank),
reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert),
params.layoutD2,
static_cast<int32_t>(n2),
static_cast<int32_t>(L1TileShape::N),
shmem,
static_cast<int32_t>(peermemInfo.offsetD)
static_cast<int32_t>(n2)
};

uint32_t n = params.problemShape.n();
@@ -890,109 +878,65 @@ private:

blockEpilogue1.Finalize();

CombineSetFlag();
CombineV2(params, blockEpilogue2);

blockEpilogue2.SetFlag();
CombineV1(params, blockEpilogue2);
AscendC::SyncAll<true>();
#ifndef __CROSSRANKSYNCANDALLGATHERV1__
ResetTokenPerExpert(params.EP * AlignUp(params.EP * params.expertPerRank, 128));
#endif
shmem.InitStatusTargetSum();
if (get_subblockid() == 0) {
AscendC::LocalTensor<int32_t> ctrBuffer = resource.ubBuf.template GetBufferByByte<int32_t>(0);
shmem.CrossRankSyncV2Set(ctrBuffer);
} else {
uint32_t uboffset = 0;
uint32_t aicCoreNum = coreNum / 2;
uint32_t aicCoreIdx = get_block_idx();
uint32_t sendRankNum_ = params.EP / aicCoreNum;
uint32_t remainderRankNum = params.EP % aicCoreNum;
if (aicCoreIdx < remainderRankNum) {
sendRankNum_++;
}
AscendC::LocalTensor<float> statusTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
uboffset += sendRankNum_ * UB_ALIGN;
AscendC::LocalTensor<float> gatherMaskOutTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
uboffset += AlignUp(params.EP * sizeof(float), 32);
AscendC::LocalTensor<uint32_t> gatherTmpTensor = resource.ubBuf.template GetBufferByByte<uint32_t>(uboffset);
uboffset += AlignUp(sizeof(uint32_t), 32);
AscendC::LocalTensor<float> statusSumOutTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
uboffset += AlignUp(sizeof(float), 32);
shmem.CrossRankSyncV2Wait(statusTensor, gatherMaskOutTensor, gatherTmpTensor, statusSumOutTensor);
MoeTokenUnpermuteTilingData tilingData;
MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum / 2);
KernelMoeTokenUnpermute<ElementD2, int32_t, float, true> kernelMoeTokenUnpermuteOp;
kernelMoeTokenUnpermuteOp.Init(shmem() + peermemInfo.offsetD, workspaceInfo.expandedRowIdx, params.probs, reinterpret_cast<GM_ADDR>(params.ptrOutput), &tilingData);
kernelMoeTokenUnpermuteOp.Process();
}

shmem.CrossRankSync();

MoeTokenUnpermuteTilingData tilingData;
MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum);
KernelMoeTokenUnpermute<ElementD2, int32_t, float, true> kernelMoeTokenUnpermuteOp;
kernelMoeTokenUnpermuteOp.Init(shmem() + peermemInfo.offsetD, workspaceInfo.expandedRowIdx, params.probs, reinterpret_cast<GM_ADDR>(params.ptrOutput), &tilingData);
kernelMoeTokenUnpermuteOp.Process();
}

CATLASS_DEVICE
void CombineV2(Params const &params, BlockEpilogue2 & blockEpilogue) {
BlockScheduler blockScheduler;
int32_t syncLoopIdx = 0;
uint32_t startCoreIdx = 0;
uint32_t aicCoreNum = coreNum / 2;
uint32_t aicCoreIdx = get_block_idx();
uint32_t aivSubCoreIdx = get_subblockid();
uint32_t preSrcExpertSum = 0;
void CombineV1(Params const &params, BlockEpilogue2 & blockEpilogue) {
uint32_t n2 = params.problemShape.k();
uint32_t k2 = params.problemShape.n() / 2;
int32_t prevGroupSum2 = 0;

icache_preload(8);
for (uint32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) {
uint32_t currentExpertM = cumsumMM((params.EP - 1) * params.expertPerRank + groupIdx);
if (preSrcExpertSum >= params.maxOutputSize) {
currentExpertM = 0;
} else if (preSrcExpertSum + currentExpertM > params.maxOutputSize) {
currentExpertM = params.maxOutputSize - preSrcExpertSum;
}
GemmCoord inGroupProblemShape{currentExpertM, n2, k2}; // M N K
blockScheduler.Update(inGroupProblemShape, MakeCoord(L1TileShape::M, L1TileShape::N));
uint32_t coreLoops = blockScheduler.GetCoreLoops();
uint32_t startLoopIdx = ((aicCoreIdx < startCoreIdx) ? (aicCoreIdx + aicCoreNum) : aicCoreIdx) - startCoreIdx;
for (uint32_t t_groupIdx = 0; t_groupIdx < params.expertPerRank; ++t_groupIdx) {
int32_t flagId = t_groupIdx / CROSS_CORE_FLAG_MAX_SET_COUNT;
AscendC::CrossCoreWaitFlag<0x2>(flagId);
AscendC::SyncAll<true>();

for (uint32_t loopIdx = startLoopIdx; loopIdx < coreLoops; loopIdx += aicCoreNum) {
GemmCoord blockCoord = blockScheduler.GetBlockCoord(loopIdx);
GemmCoord actualBlockShape = blockScheduler.GetActualBlockShape(blockCoord);
int32_t m0 = 16;
// Block count, the shape of each block is (m0, actualBlockShape.n())
int32_t m_rows = (actualBlockShape.m() + m0 - 1) / m0;
int32_t aiv_m_rows = m_rows / 2;
if (aivSubCoreIdx == 1 && aiv_m_rows * 2 < m_rows) {
aiv_m_rows += 1;
}
uint32_t m_offset = blockCoord.m() * L1TileShape::M;//blockOffset
if(aivSubCoreIdx == 1) {
m_offset += (m_rows / 2) * m0;
}
uint32_t groupIdx = t_groupIdx;

for (;syncLoopIdx <= groupIdx; syncLoopIdx ++) {
int32_t flag_id = syncLoopIdx / CROSS_CORE_FLAG_MAX_SET_COUNT;
AscendC::CrossCoreWaitFlag<0x2>(flag_id);
}

for (int32_t cur_row = 0; cur_row < aiv_m_rows; cur_row ++) {
GemmCoord realTileCoord{m_offset, blockCoord.n() * L1TileShape::N, 1};
uint32_t actualm = m0;
if(aivSubCoreIdx == 1 && cur_row == aiv_m_rows - 1){
actualm = actualBlockShape.m() - (m_rows / 2) * m0 - cur_row * m0;
for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
__gm__ void* dstPeermemPtr = shmem(peermemInfo.offsetD, dstEpIdx);
AscendC::GlobalTensor<ElementD2> gmRemotePeer;
gmRemotePeer.SetGlobalBuffer(reinterpret_cast<__gm__ ElementD2*>(dstPeermemPtr));
uint32_t srcRowOffset = (dstEpIdx == 0 ? 0 : cumsumMM((dstEpIdx - 1) * params.expertPerRank + groupIdx)) + prevGroupSum2;
if (srcRowOffset < params.maxOutputSize) {
uint32_t dataRows = tokenPerExpert(tokenPerExpertLayout(dstEpIdx, params.rank, groupIdx));
if (srcRowOffset + dataRows > params.maxOutputSize) {
dataRows = params.maxOutputSize - srcRowOffset;
}
//uint32_t dstRowOffset = preSumBeforeRank(2 * dstEpIdx * FLAGSTRIDE + groupIdx);
int32_t tmpBlock = AlignUp(params.expertPerRank, FLAGSTRIDE);
//uint32_t dstRowOffset = preSumBeforeRank(dstEpIdx * tmpBlock + groupIdx);
uint32_t dstRowOffset = preSumBeforeRank(dstEpIdx * params.expertPerRank + groupIdx);
MatrixCoord offsetC{srcRowOffset, 0};
MatrixCoord offsetPeer{dstRowOffset, 0};
MatrixCoord shapeC{dataRows, n2};
int64_t gmOffsetC = params.layoutD2.GetOffset(offsetC);
int64_t gmOffsetPeer = params.layoutD2.GetOffset(offsetPeer);
if constexpr (std::is_same_v<ElementA, int8_t>) {
blockEpilogue(gmC2[gmOffsetC], shapeC, gmPerTokenScale2[srcRowOffset], gmRemotePeer[gmOffsetPeer]);
} else {
blockEpilogue(gmC2[gmOffsetC], shapeC, gmRemotePeer[gmOffsetPeer]);
}
GemmCoord realTileShape{actualm, actualBlockShape.n(), 1};
blockEpilogue(gmC2, gmPerTokenScale2, realTileCoord, realTileShape, groupIdx, preSrcExpertSum, preSumBeforeRank);
m_offset += m0;
}
}
preSrcExpertSum += currentExpertM;
startCoreIdx = (startCoreIdx + coreLoops) % aicCoreNum;
prevGroupSum2 += cumsumMM((params.EP - 1) * params.expertPerRank + groupIdx);
}
blockEpilogue.Finalize();
}

private:
struct WorkspaceInfo {
GM_ADDR ptrA;
@@ -1096,4 +1040,4 @@ private:

} // namespace Catlass::Gemm::Kernel

#endif // DISPATCH_FFN_COMBINE_KERNEL_HPP
#endif // DISPATCH_FFN_COMBINE_KERNEL_HPP
@@ -35,6 +35,7 @@ class MoeV2FullLoadDynamicQuant : public MoeV2SortBase {
__aicore__ inline void CopyOutIdx();
__aicore__ inline void CopyOutEmpty();
__aicore__ inline void CopyOutXQuant1H();
__aicore__ inline void CopyOutXQuantEH();
__aicore__ inline void ComputeExpertTokenCountOrCumsum();
__aicore__ inline void Compute(LocalTensor<float>& smoothLocal);

@@ -48,7 +49,6 @@ class MoeV2FullLoadDynamicQuant : public MoeV2SortBase {
int64_t k_;
int64_t n_;
int64_t cols_;
int64_t cols_scale_;
int64_t activateRows_;
int64_t expertNum;
int64_t expertCapacity;
@@ -63,10 +63,12 @@ class MoeV2FullLoadDynamicQuant : public MoeV2SortBase {
TQue<QuePosition::VECIN, 1> smoothInQueue;
TQue<QuePosition::VECOUT, 1> calcQueue;
TQue<QuePosition::VECOUT, 1> inputXOutQueue;
TQue<QuePosition::VECOUT, 1> scaleOutQueue;

GlobalTensor<T> xGm_;
GlobalTensor<int32_t> expertIdxGm_;
GlobalTensor<float> quantSmoothGm;
GlobalTensor<float> dynamicQuantScaleGm;

GlobalTensor<int8_t> expandedXGm_;
GlobalTensor<int32_t> expandedRowIdxGm_;
@@ -223,7 +225,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&

LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();
LocalTensor<float> dynamicQuantLocal = outLocal[this->cols_].template ReinterpretCast<float>();
LocalTensor<float> dynamicQuantLocal = scaleOutQueue.AllocTensor<float>();

if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_);
@@ -257,6 +259,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&

calcQueue.FreeTensor(tempLocal);
inputXOutQueue.EnQue(outLocal);
scaleOutQueue.EnQue(dynamicQuantLocal);
}

template <typename T>
@@ -272,7 +275,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuant1H() {

DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
DataCopyExtParams intriParams{1, static_cast<uint32_t>((this->cols_ + BLOCK_BYTES) * sizeof(int8_t)), 0, 0, 0};
DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};

LocalTensor<float> smoothLocal;
if (smoothType == 1) {
@@ -292,6 +295,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuant1H() {
xCopyInQueue_.EnQue<T>(xLocal);
Compute(smoothLocal);

LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>();
while (curRowsStart <= curRowsEnd && curRowsStart / this->k_ == row) {
int32_t outIndex = expandedRowIdx.GetValue(curRowsStart);
@@ -299,15 +303,74 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuant1H() {
if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows_)) {
continue;
}
DataCopyPad(expandedXGm_[outIndex * this->cols_scale_], outLocal, intriParams);
DataCopyPad(expandedXGm_[outIndex * cols_], outLocal, intriParams);
DataCopyPad(dynamicQuantScaleGm[outIndex], quantScaleLocal, {1, 4, 0, 0, 0});
}

xCopyInQueue_.FreeTensor(xLocal);
inputXOutQueue.FreeTensor(outLocal);
scaleOutQueue.FreeTensor(quantScaleLocal);
}
if (smoothType == 1) {
smoothInQueue.FreeTensor(smoothLocal);
}
expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
}

template <typename T>
__aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuantEH() {
LocalTensor<int32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.DeQue<int32_t>();
expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
Muls(expandDstToSrcRowLocal.ReinterpretCast<float>(), expandDstToSrcRowLocal.ReinterpretCast<float>(), (float)-1,
this->totalLength);
pipe_barrier(PIPE_V);
LocalTensor<int32_t> sortedRowIdx = expandDstToSrcRowLocal.ReinterpretCast<int32_t>();
Cast(sortedRowIdx, expandDstToSrcRowLocal.ReinterpretCast<float>(), RoundMode::CAST_ROUND, this->totalLength);

int64_t curRowsStart = this->blockIdx_ * this->perCoreRows_;
int64_t curRowsEnd = curRowsStart + this->coreRows_ - 1;

DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};

for (int64_t row = curRowsStart; row <= curRowsEnd; row++) {
if (this->dropPadMode == DROPLESS_MODE && row >= this->activateRows_) {
break;
}
int32_t srcIdx = sortedRowIdx.GetValue(row);
int32_t expertIdx = expandedExpertIdxLocal.GetValue(row);

LocalTensor<T> inLocal = xCopyInQueue_.AllocTensor<T>();
LocalTensor<float> smoothLocal = smoothInQueue.AllocTensor<float>();
if constexpr (IsSameType<T, float>::value) {
DataCopyPad(inLocal, xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
} else {
DataCopyPad(inLocal[colsAlign], xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
}
DataCopyPad(smoothLocal, quantSmoothGm[expertIdx * this->cols_], smoothCopyParams, {false, 0, 0, 0});
xCopyInQueue_.EnQue<T>(inLocal);
smoothInQueue.EnQue(smoothLocal);
smoothLocal = smoothInQueue.DeQue<float>();

Compute(smoothLocal);

LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
DataCopyPad(dynamicQuantScaleGm[row], quantScaleLocal, {1, 4, 0, 0, 0});

LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>();
DataCopyPad(expandedXGm_[row * this->cols_], outLocal, intriParams);

xCopyInQueue_.FreeTensor(inLocal);
smoothInQueue.FreeTensor(smoothLocal);
inputXOutQueue.FreeTensor(outLocal);
scaleOutQueue.FreeTensor(quantScaleLocal);
}

expandDstToSrcRowQueue_.FreeTensor(expandDstToSrcRowLocal);
expandedExpertIdxCopyOutQueue_.FreeTensor(expandedExpertIdxLocal);
}

template <typename T>
__aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR expandedX,
GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum,
@@ -321,7 +384,6 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR exp
this->k_ = tilingData->k;
this->n_ = tilingData->n;
this->cols_ = tilingData->cols;
this->cols_scale_ = this->cols_ + ALIGN_512;
this->needCoreNum_ = this->gatherOutTilingData_->needCoreNum;
this->perCoreRows_ = this->gatherOutTilingData_->perCoreRows;
this->activateRows_ = this->gatherOutTilingData_->activateRows;
@@ -352,6 +414,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR exp
Align(this->expertNum, sizeof(int32_t)));
}
quantSmoothGm.SetGlobalBuffer((__gm__ float*)quantSmooth);
dynamicQuantScaleGm.SetGlobalBuffer((__gm__ float*)dynamicQuantScale);

int64_t kvFactor = 2;
int64_t buffSize = this->sortNum_ * sizeof(int32_t);
@@ -375,7 +438,8 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR exp
}
pipe->InitBuffer(smoothInQueue, 1, AlignBytes(this->cols_, sizeof(float)));
pipe->InitBuffer(calcQueue, 1, AlignBytes(this->cols_, sizeof(float)));
pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->cols_scale_, sizeof(int8_t)));
pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->cols_, sizeof(int8_t)));
pipe->InitBuffer(scaleOutQueue, 1, BLOCK_BYTES + BLOCK_BYTES);
}

template <typename T>
@@ -391,7 +455,11 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Process() {
} else {
CopyOutEmpty();
}
CopyOutXQuant1H();
if (smoothType == 2) {
CopyOutXQuantEH();
} else {
CopyOutXQuant1H();
}
}
}
} // namespace MoeInitRoutingQuantV2

@@ -66,7 +66,6 @@ class MoeV2GatherDynamicQuant {
int64_t needCoreNum;
int64_t blockIdx;
int64_t cols;
int64_t cols_scale_;
int64_t n;
int64_t k;
int64_t totalLength;
@@ -118,7 +117,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s

LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();
LocalTensor<float> dynamicQuantLocal = outLocal[this->cols].template ReinterpretCast<float>();
LocalTensor<float> dynamicQuantLocal = scaleOutQueue.AllocTensor<float>();

if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
@@ -152,6 +151,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s

calcQueue.FreeTensor(tempLocal);
inputXOutQueue.EnQue(outLocal);
scaleOutQueue.EnQue(dynamicQuantLocal);
}

template <typename T>
@@ -163,7 +163,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr
int64_t currentLoopStartRow = initialRow / this->k;
int64_t currentLoopLastRow = (initialRow + this->currentLoopRows - 1) / this->k;
DataCopyExtParams copyInParams{1, static_cast<uint32_t>(this->cols * sizeof(T)), 0, 0, 0};
DataCopyExtParams copyOutParams{1, static_cast<uint32_t>((this->cols + BLOCK_BYTES) * sizeof(int8_t)), 0, 0, 0};
DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(this->cols * sizeof(int8_t)), 0, 0, 0};
DataCopyExtParams smoothParams{1, static_cast<uint32_t>(this->cols * sizeof(float)), 0, 0, 0};

LocalTensor<float> smoothLocal;
@@ -187,6 +187,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr
// Compute quantization
Compute(smoothLocal);

LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>();

while (curLoopRow < this->currentLoopRows && initialRow / this->k == row) {
@@ -196,11 +197,15 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr
if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows)) {
continue;
}
// Scale is placed after the data position
DataCopyPad(expandedXGm[outIndex * cols_scale_], outLocal, copyOutParams);
DataCopyPad(expandedXGm[outIndex * cols], outLocal, copyOutParams);
DataCopyPad(dynamicQuantScaleGm[outIndex], quantScaleLocal, {1, 4, 0, 0, 0});
}
inputXInQueue.FreeTensor(inLocal);
inputXOutQueue.FreeTensor(outLocal);
scaleOutQueue.FreeTensor(quantScaleLocal);
}
if (smoothType == 1) {
smoothInQueue.FreeTensor(smoothLocal);
}
expandRowIdxInQueue.FreeTensor(indicesLocal);
}
@@ -458,7 +463,6 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Init(GM_ADDR inputX, GM_ADDR
this->needCoreNum = this->gatherOutTilingData->needCoreNum;
this->activateRows = this->gatherOutTilingData->activateRows;
this->cols = tilingData->cols;
this->cols_scale_ = this->cols + ALIGN_512;
this->n = tilingData->n;
this->k = tilingData->k;
this->totalLength = tilingData->n * tilingData->k;
@@ -514,15 +518,33 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Init(GM_ADDR inputX, GM_ADDR
pipe->InitBuffer(smoothInQueue, BUFFER_NUM, AlignBytes(this->perLoopCols, sizeof(float)));
pipe->InitBuffer(calcQueue, 1, AlignBytes(this->perLoopCols, sizeof(float)));
pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->perLoopCols, sizeof(int8_t)));
pipe->InitBuffer(scaleOutQueue, 1, BLOCK_BYTES + BLOCK_BYTES);
}

template <typename T>
__aicore__ inline void MoeV2GatherDynamicQuant<T>::Process() {
if (this->blockIdx < this->needCoreNum) {
currentLoopRows = perLoopRows;
if (colLoops > 1) { // Cannot fit all data in one row, workspace is required
trap(); // Not supported
} else { // All data can fit in one row

if (colLoops > 1) { // A single row cannot be fully loaded; workspace is required
if (smoothType == 2) {
for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
CopyInExpandedExpertIdx(loop);
CopyOutPartialXQuantEH(loop);
}
currentLoopRows = lastLoopRows;
CopyInExpandedExpertIdx(this->rowLoops - 1);
CopyOutPartialXQuantEH(this->rowLoops - 1);
} else {
for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
CopyInExpandedRowIdx(loop);
CopyOutPartialXQuant1H(loop);
}
currentLoopRows = lastLoopRows;
CopyInExpandedRowIdx(this->rowLoops - 1);
CopyOutPartialXQuant1H(this->rowLoops - 1);
}
} else { // A single row can be fully loaded
if (smoothType == 2) {
for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
CopyInExpandedExpertIdx(loop);
@@ -85,9 +85,8 @@ KernelMoeTokenUnpermute<T1, T2, T3, PROBS>::Init(GM_ADDR permuted_tokens, GM_AD
GM_ADDR unpermuted_tokens,
const MoeTokenUnpermuteTilingData *__restrict tiling_data)
{
this->blockIdx = get_block_idx();
this->blockNum = get_block_num();

this->blockIdx = get_block_idx() + get_subblockid() * get_block_num();
this->blockNum = get_block_num() * get_subblockdim();
if (blockIdx >= blockNum) {
return;
}
@@ -99,12 +99,20 @@ public:
eventUbDMTE3VList[i] = eventMTE3V++;
eventUbDVMTE3List[i] = eventVMTE3++;

AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[i]);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[i]);

ubCFp32List[i] = resource.ubBuf.template GetBufferByByte<float>(ubOffset);
ubOffset += blockN * sizeof(float);
}
}
CATLASS_DEVICE
void SetFlag()
{
for (uint32_t i = 0; i < UB_STAGES; ++i) {
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[i]);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[i]);
}
}

CATLASS_DEVICE
void Finalize()
{