[Build] Add support for Ascend950 chip (#7151)
### What this PR does / why we need it?
This PR adds support for the Ascend950 chip. This includes:
- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize
the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the
Ascend950 hardware target.
- Performing a codebase-wide refactoring of `pipe_barrier()` calls to
the namespaced `AscendC::PipeBarrier<>()` for improved code consistency
and adherence to the latest API standards.
Testing: the Ascend950DT end-to-end run (Qwen3-32B-MXFP8) passed, and CI passed.
- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
@@ -99,9 +99,9 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
|
||||
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
|
||||
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
|
||||
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
||||
if (duplicateNum > 0) {
|
||||
int duplicateIndex = this->totalLength - duplicateNum;
|
||||
@@ -110,38 +110,38 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
|
||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||
uint64_t mask[2] = {mask0, 0};
|
||||
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
LocalTensor<float> concatLocal;
|
||||
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
||||
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
|
||||
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
|
||||
expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
|
||||
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
|
||||
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
|
||||
this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
|
||||
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
|
||||
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
|
||||
|
||||
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
|
||||
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
|
||||
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
if (duplicateNum > 0) {
|
||||
int duplicateIndex = this->totalLength - duplicateNum;
|
||||
uint64_t mask0 = UINT64_MAX;
|
||||
@@ -149,14 +149,14 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
|
||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||
uint64_t mask[2] = {mask0, 0};
|
||||
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
|
||||
sortDataCopyInQueue.FreeTensor(inLocal);
|
||||
}
|
||||
@@ -227,31 +227,31 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&
|
||||
|
||||
if constexpr (!IsSameType<T, float>::value) {
|
||||
Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
if (smoothType != 0) {
|
||||
Mul(inLocal, inLocal, smoothLocal, this->cols_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
Abs(tempLocal, inLocal, this->cols_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
|
||||
|
||||
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
|
||||
Duplicate<float>(tempLocal, maxValue, this->cols_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Div(tempLocal, inLocal, tempLocal, this->cols_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols_);
|
||||
|
||||
|
||||
@@ -56,34 +56,34 @@ __aicore__ inline void MoeV2FullLoadQuant<T>::Compute(int64_t xLocalLength) {
|
||||
uint32_t elements = Align(this->cols, sizeof(int8_t)) * xLocalLength;
|
||||
if constexpr (IsSameType<T, bfloat16_t>::value) {
|
||||
Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
|
||||
Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
SetDeqScale((half)1.000000e+00f);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||
} else if constexpr (IsSameType<T, float>::value) {
|
||||
Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||
} else {
|
||||
Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
|
||||
}
|
||||
inputXCopyOutQueue.EnQue(outLocal);
|
||||
|
||||
@@ -94,9 +94,9 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
|
||||
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
|
||||
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
|
||||
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
||||
if (duplicateNum > 0) {
|
||||
int duplicateIndex = this->totalLength - duplicateNum;
|
||||
@@ -105,38 +105,38 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
|
||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||
uint64_t mask[2] = {mask0, 0};
|
||||
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
LocalTensor<float> concatLocal;
|
||||
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
||||
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum].template ReinterpretCast<uint32_t>();
|
||||
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue.AllocTensor<float>();
|
||||
LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue.AllocTensor<uint32_t>();
|
||||
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
|
||||
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
|
||||
this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
|
||||
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
|
||||
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
expandedExpertIdxCopyOutQueue.EnQue<int32_t>(expandedExpertIdxLocalInt32);
|
||||
|
||||
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue.AllocTensor<uint32_t>();
|
||||
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
|
||||
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
ArithProgression<int32_t>(inLocal[this->sortNum], 0, 1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
if (duplicateNum > 0) {
|
||||
int duplicateIndex = this->totalLength - duplicateNum;
|
||||
uint64_t mask0 = UINT64_MAX;
|
||||
@@ -144,14 +144,14 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
|
||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||
uint64_t mask[2] = {mask0, 0};
|
||||
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
expandedRowIdxCopyOutQueue.EnQue<uint32_t>(expandedRowIdx);
|
||||
sortDataCopyInQueue.FreeTensor(inLocal);
|
||||
|
||||
|
||||
@@ -122,31 +122,31 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s
|
||||
|
||||
if constexpr (!IsSameType<T, float>::value) {
|
||||
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
if (smoothType != 0) {
|
||||
Mul(inLocal, inLocal, smoothLocal, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
Abs(tempLocal, inLocal, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
|
||||
|
||||
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
|
||||
Duplicate<float>(tempLocal, maxValue, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Div(tempLocal, inLocal, tempLocal, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);
|
||||
|
||||
@@ -285,16 +285,16 @@ __aicore__ inline float MoeV2GatherDynamicQuant<T>::ComputeMax(LocalTensor<float
|
||||
|
||||
if constexpr (!IsSameType<T, float>::value) {
|
||||
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
if (smoothType != 0) {
|
||||
Mul(inLocal, inLocal, smoothLocal, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
Abs(tempLocal, inLocal, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);
|
||||
|
||||
@@ -319,13 +319,13 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::ComputeScale(LocalTensor<floa
|
||||
inLocal = inputXInQueue.DeQue<float>();
|
||||
|
||||
Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Div(tempLocal, inLocal, tempLocal, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);
|
||||
|
||||
|
||||
@@ -95,34 +95,34 @@ __aicore__ inline void MoeV2GatherQuant<T>::Compute() {
|
||||
uint32_t elements = Align(this->colsTileLength, sizeof(T));
|
||||
if constexpr (IsSameType<T, bfloat16_t>::value) {
|
||||
Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
|
||||
Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
SetDeqScale((half)1.000000e+00f);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||
} else if constexpr (IsSameType<T, float>::value) {
|
||||
Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||
} else {
|
||||
Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
|
||||
}
|
||||
inputXCopyOutQueue.EnQue(outLocal);
|
||||
|
||||
@@ -88,9 +88,9 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
|
||||
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
|
||||
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
|
||||
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
||||
if (duplicateNum > 0) {
|
||||
int duplicateIndex = this->totalLength - duplicateNum;
|
||||
@@ -99,38 +99,38 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
|
||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||
uint64_t mask[2] = {mask0, 0};
|
||||
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
LocalTensor<float> concatLocal;
|
||||
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
||||
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
|
||||
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
|
||||
LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
|
||||
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
|
||||
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
|
||||
this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
|
||||
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
|
||||
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
|
||||
|
||||
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
|
||||
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
|
||||
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
if (duplicateNum > 0) {
|
||||
int duplicateIndex = this->totalLength - duplicateNum;
|
||||
uint64_t mask0 = UINT64_MAX;
|
||||
@@ -138,14 +138,14 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
|
||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||
uint64_t mask[2] = {mask0, 0};
|
||||
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
|
||||
sortDataCopyInQueue.FreeTensor(inLocal);
|
||||
|
||||
|
||||
@@ -168,9 +168,9 @@ __aicore__ inline void MoeV2MrgsortOut::UpdateSortInfo() {
|
||||
|
||||
__aicore__ inline void MoeV2MrgsortOut::Extract() {
|
||||
AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM));
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float)));
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float)));
|
||||
}
|
||||
|
||||
|
||||
@@ -106,9 +106,9 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64
|
||||
|
||||
expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
|
||||
Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, sortNum);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, sortNum);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
int64_t duplicateNum = size % ONE_REPEAT_SORT_NUM;
|
||||
if (duplicateNum > 0) {
|
||||
@@ -118,7 +118,7 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64
|
||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||
uint64_t mask[2] = {mask0, 0};
|
||||
Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
LocalTensor<float> concatLocal = expertForSourceRowLocalFp32;
|
||||
|
||||
@@ -56,9 +56,9 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() {
|
||||
LocalTensor<int32_t> expertForSourceRowLocal = inLocal[0];
|
||||
LocalTensor<float> expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
|
||||
Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, this->tileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
||||
if (duplicateNum > 0) {
|
||||
@@ -68,28 +68,28 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() {
|
||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||
uint64_t mask[2] = {mask0, 0};
|
||||
Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
LocalTensor<float> concatLocal;
|
||||
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
||||
Concat(concatLocal, expertForSourceRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
||||
LocalTensor<uint32_t> sourceRowLocal;
|
||||
sourceRowLocal = inLocal[this->sortNum].ReinterpretCast<uint32_t>();
|
||||
Sort<float, true>(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
|
||||
LocalTensor<float> sortedExpertForSourceRowLocal = outLocal[0];
|
||||
LocalTensor<uint32_t> expandDstToSrcRowLocal;
|
||||
expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast<uint32_t>();
|
||||
Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
LocalTensor<int32_t> expertForSourceRowLocalInt32;
|
||||
expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast<int32_t>();
|
||||
|
||||
@@ -164,31 +164,31 @@ __aicore__ inline void MoeV2SrcToDstAndGather<T, TilingData>::Compute(int32_t sr
|
||||
|
||||
if constexpr (!IsSameType<T, float>::value) {
|
||||
Cast(inLocal, inLocal.template ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
if (smoothType != 0) {
|
||||
Mul(inLocal, inLocal, smoothLocal, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
Abs(tempLocal, inLocal, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
|
||||
|
||||
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
|
||||
Duplicate<float>(tempLocal, maxValue, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Div(tempLocal, inLocal, tempLocal, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);
|
||||
|
||||
@@ -274,7 +274,7 @@ __aicore__ inline float MoeV2SrcToDstAndGather<T, TilingData>::ComputeMax(LocalT
|
||||
|
||||
if constexpr (!IsSameType<T, float>::value) {
|
||||
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
if (smoothType != 0) {
|
||||
@@ -284,11 +284,11 @@ __aicore__ inline float MoeV2SrcToDstAndGather<T, TilingData>::ComputeMax(LocalT
|
||||
smoothLocal = smoothInQueue.DeQue<float>();
|
||||
|
||||
Mul(inLocal, inLocal, smoothLocal, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
Abs(tempLocal, inLocal, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);
|
||||
|
||||
@@ -314,13 +314,13 @@ __aicore__ inline void MoeV2SrcToDstAndGather<T, TilingData>::ComputeScale(Local
|
||||
inLocal = inputXInQueue.DeQue<float>();
|
||||
|
||||
Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Div(tempLocal, inLocal, tempLocal, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);
|
||||
|
||||
|
||||
@@ -75,13 +75,13 @@ __aicore__ inline void MoeV2SrcToDstOp::Compute(int64_t progress) {
|
||||
LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
|
||||
LocalTensor<int32_t> assistTensor = assistBuffer.Get<int32_t>(ASSIST_NUM);
|
||||
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
int64_t loops = Ceil(currentLoopRows, ASSIST_INDEX_NUM);
|
||||
for (int64_t i = 0; i < loops; i++) {
|
||||
Adds(outLocal[i * ASSIST_NUM], assistTensor,
|
||||
static_cast<int32_t>(this->perLoopRows * progress + i * ASSIST_INDEX_NUM), ASSIST_NUM);
|
||||
}
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
copyOutQueue.EnQue<int32_t>(outLocal);
|
||||
}
|
||||
|
||||
|
||||
@@ -226,7 +226,7 @@ public:
|
||||
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(RECV_SYNC_EVENT_ID);
|
||||
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(SEND_SYNC_EVENT_ID);
|
||||
AscendC::CrossCoreWaitFlag(SEND_SYNC_EVENT_ID);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
|
||||
ctrBuffer.SetValue(0, epStateValue_);
|
||||
AscendC::SetFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
|
||||
|
||||
Reference in New Issue
Block a user