[Build] Add support for Ascend950 chip (#7151)

### What this PR does / why we need it?
This PR adds support for the Ascend950 chip. This includes:
- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize
the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the
Ascend950 hardware target.
- Performing a codebase-wide refactoring of `pipe_barrier(...)` calls to
the namespaced template form `AscendC::PipeBarrier<...>()` (for example,
`pipe_barrier(PIPE_V)` becomes `AscendC::PipeBarrier<PIPE_V>()`) for improved
code consistency and adherence to the latest API standards.

Testing: the Ascend950DT end-to-end run passed (Qwen3-32B-MXFP8), and CI passed.
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
linfeng-yuan
2026-03-12 10:25:51 +08:00
committed by GitHub
parent da01a74009
commit 5f3826b093
24 changed files with 246 additions and 227 deletions

View File

@@ -99,9 +99,9 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
if (duplicateNum > 0) {
int duplicateIndex = this->totalLength - duplicateNum;
@@ -110,38 +110,38 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
uint64_t mask[2] = {mask0, 0};
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
LocalTensor<float> concatLocal;
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
if (duplicateNum > 0) {
int duplicateIndex = this->totalLength - duplicateNum;
uint64_t mask0 = UINT64_MAX;
@@ -149,14 +149,14 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
uint64_t mask[2] = {mask0, 0};
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
sortDataCopyInQueue.FreeTensor(inLocal);
}
@@ -227,31 +227,31 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&
if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
if (smoothType != 0) {
Mul(inLocal, inLocal, smoothLocal, this->cols_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
Abs(tempLocal, inLocal, this->cols_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
Duplicate<float>(tempLocal, maxValue, this->cols_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Div(tempLocal, inLocal, tempLocal, this->cols_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols_);

View File

@@ -56,34 +56,34 @@ __aicore__ inline void MoeV2FullLoadQuant<T>::Compute(int64_t xLocalLength) {
uint32_t elements = Align(this->cols, sizeof(int8_t)) * xLocalLength;
if constexpr (IsSameType<T, bfloat16_t>::value) {
Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
SetDeqScale((half)1.000000e+00f);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
} else if constexpr (IsSameType<T, float>::value) {
Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
} else {
Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
}
inputXCopyOutQueue.EnQue(outLocal);

View File

@@ -94,9 +94,9 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
if (duplicateNum > 0) {
int duplicateIndex = this->totalLength - duplicateNum;
@@ -105,38 +105,38 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
uint64_t mask[2] = {mask0, 0};
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
LocalTensor<float> concatLocal;
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum].template ReinterpretCast<uint32_t>();
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue.AllocTensor<float>();
LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue.AllocTensor<uint32_t>();
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
expandedExpertIdxCopyOutQueue.EnQue<int32_t>(expandedExpertIdxLocalInt32);
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue.AllocTensor<uint32_t>();
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
ArithProgression<int32_t>(inLocal[this->sortNum], 0, 1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
if (duplicateNum > 0) {
int duplicateIndex = this->totalLength - duplicateNum;
uint64_t mask0 = UINT64_MAX;
@@ -144,14 +144,14 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
uint64_t mask[2] = {mask0, 0};
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
expandedRowIdxCopyOutQueue.EnQue<uint32_t>(expandedRowIdx);
sortDataCopyInQueue.FreeTensor(inLocal);

View File

@@ -122,31 +122,31 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s
if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
if (smoothType != 0) {
Mul(inLocal, inLocal, smoothLocal, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
Abs(tempLocal, inLocal, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
Duplicate<float>(tempLocal, maxValue, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Div(tempLocal, inLocal, tempLocal, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);
@@ -285,16 +285,16 @@ __aicore__ inline float MoeV2GatherDynamicQuant<T>::ComputeMax(LocalTensor<float
if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
if (smoothType != 0) {
Mul(inLocal, inLocal, smoothLocal, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
Abs(tempLocal, inLocal, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);
@@ -319,13 +319,13 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::ComputeScale(LocalTensor<floa
inLocal = inputXInQueue.DeQue<float>();
Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Div(tempLocal, inLocal, tempLocal, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);

View File

@@ -95,34 +95,34 @@ __aicore__ inline void MoeV2GatherQuant<T>::Compute() {
uint32_t elements = Align(this->colsTileLength, sizeof(T));
if constexpr (IsSameType<T, bfloat16_t>::value) {
Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
SetDeqScale((half)1.000000e+00f);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
} else if constexpr (IsSameType<T, float>::value) {
Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
} else {
Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
}
inputXCopyOutQueue.EnQue(outLocal);

View File

@@ -88,9 +88,9 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
if (duplicateNum > 0) {
int duplicateIndex = this->totalLength - duplicateNum;
@@ -99,38 +99,38 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
uint64_t mask[2] = {mask0, 0};
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
LocalTensor<float> concatLocal;
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
if (duplicateNum > 0) {
int duplicateIndex = this->totalLength - duplicateNum;
uint64_t mask0 = UINT64_MAX;
@@ -138,14 +138,14 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
uint64_t mask[2] = {mask0, 0};
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
sortDataCopyInQueue.FreeTensor(inLocal);

View File

@@ -168,9 +168,9 @@ __aicore__ inline void MoeV2MrgsortOut::UpdateSortInfo() {
// Runs AscendC::Extract over tempBuffer with ubOutput1 / ubOutput2 (presumably the two
// destinations, matching the other Extract calls in this change set — TODO confirm), then
// multiplies ubOutput1 by -1 and casts it into ubOutputInt1.
// NOTE(review): this listing appears to be a diff rendered without +/- markers, so every barrier
// shows up twice: the legacy pipe_barrier(PIPE_V) spelling followed by its
// AscendC::PipeBarrier<PIPE_V>() replacement. Presumably only one of each pair exists in the
// actual file — confirm against the repository.
__aicore__ inline void MoeV2MrgsortOut::Extract() {
// The last argument is Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM); presumably the number of
// ONE_REPEAT_SORT_NUM-sized groups, rounded up — verify against the Ceil helper.
AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM));
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
// Multiply ubOutput1 by -1 in place; presumably this undoes a negation applied to the keys
// before sorting — TODO confirm with the sort stage that fills tempBuffer.
Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float)));
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
// Convert the values with RoundMode::CAST_ROUND and write them to ubOutputInt1.
Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float)));
}

View File

@@ -106,9 +106,9 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64
expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, sortNum);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, sortNum);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
int64_t duplicateNum = size % ONE_REPEAT_SORT_NUM;
if (duplicateNum > 0) {
@@ -118,7 +118,7 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
uint64_t mask[2] = {mask0, 0};
Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
LocalTensor<float> concatLocal = expertForSourceRowLocalFp32;

View File

@@ -56,9 +56,9 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() {
LocalTensor<int32_t> expertForSourceRowLocal = inLocal[0];
LocalTensor<float> expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, this->tileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
if (duplicateNum > 0) {
@@ -68,28 +68,28 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() {
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
uint64_t mask[2] = {mask0, 0};
Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
LocalTensor<float> concatLocal;
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
Concat(concatLocal, expertForSourceRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
LocalTensor<uint32_t> sourceRowLocal;
sourceRowLocal = inLocal[this->sortNum].ReinterpretCast<uint32_t>();
Sort<float, true>(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
LocalTensor<float> sortedExpertForSourceRowLocal = outLocal[0];
LocalTensor<uint32_t> expandDstToSrcRowLocal;
expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast<uint32_t>();
Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<int32_t> expertForSourceRowLocalInt32;
expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast<int32_t>();

View File

@@ -164,31 +164,31 @@ __aicore__ inline void MoeV2SrcToDstAndGather<T, TilingData>::Compute(int32_t sr
if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.template ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
if (smoothType != 0) {
Mul(inLocal, inLocal, smoothLocal, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
Abs(tempLocal, inLocal, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
Duplicate<float>(tempLocal, maxValue, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Div(tempLocal, inLocal, tempLocal, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);
@@ -274,7 +274,7 @@ __aicore__ inline float MoeV2SrcToDstAndGather<T, TilingData>::ComputeMax(LocalT
if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
if (smoothType != 0) {
@@ -284,11 +284,11 @@ __aicore__ inline float MoeV2SrcToDstAndGather<T, TilingData>::ComputeMax(LocalT
smoothLocal = smoothInQueue.DeQue<float>();
Mul(inLocal, inLocal, smoothLocal, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
Abs(tempLocal, inLocal, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);
@@ -314,13 +314,13 @@ __aicore__ inline void MoeV2SrcToDstAndGather<T, TilingData>::ComputeScale(Local
inLocal = inputXInQueue.DeQue<float>();
Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Div(tempLocal, inLocal, tempLocal, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);

View File

@@ -75,13 +75,13 @@ __aicore__ inline void MoeV2SrcToDstOp::Compute(int64_t progress) {
LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
LocalTensor<int32_t> assistTensor = assistBuffer.Get<int32_t>(ASSIST_NUM);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
int64_t loops = Ceil(currentLoopRows, ASSIST_INDEX_NUM);
for (int64_t i = 0; i < loops; i++) {
Adds(outLocal[i * ASSIST_NUM], assistTensor,
static_cast<int32_t>(this->perLoopRows * progress + i * ASSIST_INDEX_NUM), ASSIST_NUM);
}
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
copyOutQueue.EnQue<int32_t>(outLocal);
}

View File

@@ -226,7 +226,7 @@ public:
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(RECV_SYNC_EVENT_ID);
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(SEND_SYNC_EVENT_ID);
AscendC::CrossCoreWaitFlag(SEND_SYNC_EVENT_ID);
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
ctrBuffer.SetValue(0, epStateValue_);
AscendC::SetFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);