[Build] Add support for Ascend950 chip (#7151)
### What this PR does / why we need it?
This PR adds support for the Ascend950 chip. This includes:
- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize
the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the
Ascend950 hardware target.
- Performing a codebase-wide refactoring of `pipe_barrier()` calls to
the namespaced `AscendC::PipeBarrier<>()` for improved code consistency
and adherence to the latest API standards.
Testing: the Ascend950DT end-to-end run (Qwen3-32B-MXFP8) passed, and CI passed.
- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
@@ -61,7 +61,7 @@ set(VLLM_ASCEND_CUSTOM_OP
|
|||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE
|
set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE_310P
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/bgmv_expand.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/bgmv_expand.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/bgmv_shrink.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/bgmv_shrink.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/sgmv_expand.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/sgmv_expand.cpp
|
||||||
@@ -70,10 +70,21 @@ set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE
|
|||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
|
set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE_ASCEND950
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
|
||||||
|
)
|
||||||
|
|
||||||
if(SOC_VERSION MATCHES "ascend310p.*")
|
if(SOC_VERSION MATCHES "ascend310p.*")
|
||||||
message(STATUS "310P hardware detected: disabling MLAPO operators")
|
message(STATUS "310P hardware detected: disabling MLAPO operators")
|
||||||
message(STATUS "310P hardware detected: excluding batch_matmul_transpose operators")
|
message(STATUS "310P hardware detected: excluding batch_matmul_transpose operators")
|
||||||
list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE})
|
list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE_310P})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(SOC_VERSION MATCHES "ascend950")
|
||||||
|
message(STATUS "A5 hardware detected: disabling MLAPO operators")
|
||||||
|
message(STATUS "A5 hardware detected: excluding batch_matmul_transpose operators")
|
||||||
|
list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE_ASCEND950})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
ascendc_library(vllm_ascend_kernels SHARED
|
ascendc_library(vllm_ascend_kernels SHARED
|
||||||
|
|||||||
@@ -99,9 +99,9 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
|
|||||||
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
|
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
|
||||||
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
|
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
|
||||||
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
|
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
||||||
if (duplicateNum > 0) {
|
if (duplicateNum > 0) {
|
||||||
int duplicateIndex = this->totalLength - duplicateNum;
|
int duplicateIndex = this->totalLength - duplicateNum;
|
||||||
@@ -110,38 +110,38 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
|
|||||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||||
uint64_t mask[2] = {mask0, 0};
|
uint64_t mask[2] = {mask0, 0};
|
||||||
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
LocalTensor<float> concatLocal;
|
LocalTensor<float> concatLocal;
|
||||||
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
||||||
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
|
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
|
||||||
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
||||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
|
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
|
||||||
expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
|
expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
|
||||||
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
|
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
|
||||||
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
|
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
|
||||||
this->totalLength);
|
this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
|
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
|
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
|
||||||
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
|
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
|
||||||
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
|
expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
|
||||||
|
|
||||||
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
|
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
|
||||||
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
|
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
|
||||||
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
|
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
|
ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
if (duplicateNum > 0) {
|
if (duplicateNum > 0) {
|
||||||
int duplicateIndex = this->totalLength - duplicateNum;
|
int duplicateIndex = this->totalLength - duplicateNum;
|
||||||
uint64_t mask0 = UINT64_MAX;
|
uint64_t mask0 = UINT64_MAX;
|
||||||
@@ -149,14 +149,14 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
|
|||||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||||
uint64_t mask[2] = {mask0, 0};
|
uint64_t mask[2] = {mask0, 0};
|
||||||
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
|
expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
|
||||||
sortDataCopyInQueue.FreeTensor(inLocal);
|
sortDataCopyInQueue.FreeTensor(inLocal);
|
||||||
}
|
}
|
||||||
@@ -227,31 +227,31 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&
|
|||||||
|
|
||||||
if constexpr (!IsSameType<T, float>::value) {
|
if constexpr (!IsSameType<T, float>::value) {
|
||||||
Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_);
|
Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (smoothType != 0) {
|
if (smoothType != 0) {
|
||||||
Mul(inLocal, inLocal, smoothLocal, this->cols_);
|
Mul(inLocal, inLocal, smoothLocal, this->cols_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
Abs(tempLocal, inLocal, this->cols_);
|
Abs(tempLocal, inLocal, this->cols_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols_);
|
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
|
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
|
||||||
|
|
||||||
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
|
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
|
||||||
Duplicate<float>(tempLocal, maxValue, this->cols_);
|
Duplicate<float>(tempLocal, maxValue, this->cols_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Div(tempLocal, inLocal, tempLocal, this->cols_);
|
Div(tempLocal, inLocal, tempLocal, this->cols_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols_);
|
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols_);
|
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols_);
|
||||||
|
|
||||||
|
|||||||
@@ -56,34 +56,34 @@ __aicore__ inline void MoeV2FullLoadQuant<T>::Compute(int64_t xLocalLength) {
|
|||||||
uint32_t elements = Align(this->cols, sizeof(int8_t)) * xLocalLength;
|
uint32_t elements = Align(this->cols, sizeof(int8_t)) * xLocalLength;
|
||||||
if constexpr (IsSameType<T, bfloat16_t>::value) {
|
if constexpr (IsSameType<T, bfloat16_t>::value) {
|
||||||
Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
|
Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
|
Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
|
LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
|
||||||
Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
SetDeqScale((half)1.000000e+00f);
|
SetDeqScale((half)1.000000e+00f);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
|
Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||||
} else if constexpr (IsSameType<T, float>::value) {
|
} else if constexpr (IsSameType<T, float>::value) {
|
||||||
Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
|
Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||||
} else {
|
} else {
|
||||||
Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
|
Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
|
Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
|
Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
|
||||||
}
|
}
|
||||||
inputXCopyOutQueue.EnQue(outLocal);
|
inputXCopyOutQueue.EnQue(outLocal);
|
||||||
|
|||||||
@@ -94,9 +94,9 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
|
|||||||
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
|
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
|
||||||
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
|
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
|
||||||
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
|
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
||||||
if (duplicateNum > 0) {
|
if (duplicateNum > 0) {
|
||||||
int duplicateIndex = this->totalLength - duplicateNum;
|
int duplicateIndex = this->totalLength - duplicateNum;
|
||||||
@@ -105,38 +105,38 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
|
|||||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||||
uint64_t mask[2] = {mask0, 0};
|
uint64_t mask[2] = {mask0, 0};
|
||||||
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
LocalTensor<float> concatLocal;
|
LocalTensor<float> concatLocal;
|
||||||
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
||||||
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum].template ReinterpretCast<uint32_t>();
|
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum].template ReinterpretCast<uint32_t>();
|
||||||
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
||||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue.AllocTensor<float>();
|
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue.AllocTensor<float>();
|
||||||
LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue.AllocTensor<uint32_t>();
|
LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue.AllocTensor<uint32_t>();
|
||||||
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
|
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
|
||||||
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
|
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
|
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
|
||||||
this->totalLength);
|
this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
|
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
|
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
|
||||||
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
|
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
|
||||||
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
expandedExpertIdxCopyOutQueue.EnQue<int32_t>(expandedExpertIdxLocalInt32);
|
expandedExpertIdxCopyOutQueue.EnQue<int32_t>(expandedExpertIdxLocalInt32);
|
||||||
|
|
||||||
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue.AllocTensor<uint32_t>();
|
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue.AllocTensor<uint32_t>();
|
||||||
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
|
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
|
||||||
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
|
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
ArithProgression<int32_t>(inLocal[this->sortNum], 0, 1, this->totalLength);
|
ArithProgression<int32_t>(inLocal[this->sortNum], 0, 1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
if (duplicateNum > 0) {
|
if (duplicateNum > 0) {
|
||||||
int duplicateIndex = this->totalLength - duplicateNum;
|
int duplicateIndex = this->totalLength - duplicateNum;
|
||||||
uint64_t mask0 = UINT64_MAX;
|
uint64_t mask0 = UINT64_MAX;
|
||||||
@@ -144,14 +144,14 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
|
|||||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||||
uint64_t mask[2] = {mask0, 0};
|
uint64_t mask[2] = {mask0, 0};
|
||||||
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
|
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
expandedRowIdxCopyOutQueue.EnQue<uint32_t>(expandedRowIdx);
|
expandedRowIdxCopyOutQueue.EnQue<uint32_t>(expandedRowIdx);
|
||||||
sortDataCopyInQueue.FreeTensor(inLocal);
|
sortDataCopyInQueue.FreeTensor(inLocal);
|
||||||
|
|
||||||
|
|||||||
@@ -122,31 +122,31 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s
|
|||||||
|
|
||||||
if constexpr (!IsSameType<T, float>::value) {
|
if constexpr (!IsSameType<T, float>::value) {
|
||||||
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
|
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (smoothType != 0) {
|
if (smoothType != 0) {
|
||||||
Mul(inLocal, inLocal, smoothLocal, this->cols);
|
Mul(inLocal, inLocal, smoothLocal, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
Abs(tempLocal, inLocal, this->cols);
|
Abs(tempLocal, inLocal, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
|
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
|
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
|
||||||
|
|
||||||
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
|
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
|
||||||
Duplicate<float>(tempLocal, maxValue, this->cols);
|
Duplicate<float>(tempLocal, maxValue, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Div(tempLocal, inLocal, tempLocal, this->cols);
|
Div(tempLocal, inLocal, tempLocal, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
|
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);
|
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);
|
||||||
|
|
||||||
@@ -285,16 +285,16 @@ __aicore__ inline float MoeV2GatherDynamicQuant<T>::ComputeMax(LocalTensor<float
|
|||||||
|
|
||||||
if constexpr (!IsSameType<T, float>::value) {
|
if constexpr (!IsSameType<T, float>::value) {
|
||||||
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
|
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (smoothType != 0) {
|
if (smoothType != 0) {
|
||||||
Mul(inLocal, inLocal, smoothLocal, colsTileLength);
|
Mul(inLocal, inLocal, smoothLocal, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
Abs(tempLocal, inLocal, colsTileLength);
|
Abs(tempLocal, inLocal, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);
|
ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);
|
||||||
|
|
||||||
@@ -319,13 +319,13 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::ComputeScale(LocalTensor<floa
|
|||||||
inLocal = inputXInQueue.DeQue<float>();
|
inLocal = inputXInQueue.DeQue<float>();
|
||||||
|
|
||||||
Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
|
Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Div(tempLocal, inLocal, tempLocal, colsTileLength);
|
Div(tempLocal, inLocal, tempLocal, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
|
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);
|
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);
|
||||||
|
|
||||||
|
|||||||
@@ -95,34 +95,34 @@ __aicore__ inline void MoeV2GatherQuant<T>::Compute() {
|
|||||||
uint32_t elements = Align(this->colsTileLength, sizeof(T));
|
uint32_t elements = Align(this->colsTileLength, sizeof(T));
|
||||||
if constexpr (IsSameType<T, bfloat16_t>::value) {
|
if constexpr (IsSameType<T, bfloat16_t>::value) {
|
||||||
Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
|
Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
|
Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
|
LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
|
||||||
Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
SetDeqScale((half)1.000000e+00f);
|
SetDeqScale((half)1.000000e+00f);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
|
Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||||
} else if constexpr (IsSameType<T, float>::value) {
|
} else if constexpr (IsSameType<T, float>::value) {
|
||||||
Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
|
Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
|
||||||
} else {
|
} else {
|
||||||
Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
|
Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
|
Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
|
Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
|
||||||
}
|
}
|
||||||
inputXCopyOutQueue.EnQue(outLocal);
|
inputXCopyOutQueue.EnQue(outLocal);
|
||||||
|
|||||||
@@ -88,9 +88,9 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
|
|||||||
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
|
LocalTensor<int32_t> expertIdxLocal = inLocal[0];
|
||||||
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
|
LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
|
||||||
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
|
Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
||||||
if (duplicateNum > 0) {
|
if (duplicateNum > 0) {
|
||||||
int duplicateIndex = this->totalLength - duplicateNum;
|
int duplicateIndex = this->totalLength - duplicateNum;
|
||||||
@@ -99,38 +99,38 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
|
|||||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||||
uint64_t mask[2] = {mask0, 0};
|
uint64_t mask[2] = {mask0, 0};
|
||||||
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
LocalTensor<float> concatLocal;
|
LocalTensor<float> concatLocal;
|
||||||
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
||||||
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
|
LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
|
||||||
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
|
||||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
|
LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
|
||||||
LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
|
LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
|
||||||
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
|
LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
|
||||||
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
|
Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
|
||||||
this->totalLength);
|
this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
|
Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
|
LocalTensor<int32_t> expandedExpertIdxLocalInt32;
|
||||||
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
|
expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
|
||||||
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
|
expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
|
||||||
|
|
||||||
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
|
LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
|
||||||
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
|
LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
|
||||||
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
|
Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
|
ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
if (duplicateNum > 0) {
|
if (duplicateNum > 0) {
|
||||||
int duplicateIndex = this->totalLength - duplicateNum;
|
int duplicateIndex = this->totalLength - duplicateNum;
|
||||||
uint64_t mask0 = UINT64_MAX;
|
uint64_t mask0 = UINT64_MAX;
|
||||||
@@ -138,14 +138,14 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
|
|||||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||||
uint64_t mask[2] = {mask0, 0};
|
uint64_t mask[2] = {mask0, 0};
|
||||||
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
|
expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
|
||||||
sortDataCopyInQueue.FreeTensor(inLocal);
|
sortDataCopyInQueue.FreeTensor(inLocal);
|
||||||
|
|
||||||
|
|||||||
@@ -168,9 +168,9 @@ __aicore__ inline void MoeV2MrgsortOut::UpdateSortInfo() {
|
|||||||
|
|
||||||
__aicore__ inline void MoeV2MrgsortOut::Extract() {
|
__aicore__ inline void MoeV2MrgsortOut::Extract() {
|
||||||
AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM));
|
AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM));
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float)));
|
Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float)));
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float)));
|
Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -106,9 +106,9 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64
|
|||||||
|
|
||||||
expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
|
expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
|
||||||
Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, sortNum);
|
Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, sortNum);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, sortNum);
|
Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, sortNum);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
int64_t duplicateNum = size % ONE_REPEAT_SORT_NUM;
|
int64_t duplicateNum = size % ONE_REPEAT_SORT_NUM;
|
||||||
if (duplicateNum > 0) {
|
if (duplicateNum > 0) {
|
||||||
@@ -118,7 +118,7 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64
|
|||||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||||
uint64_t mask[2] = {mask0, 0};
|
uint64_t mask[2] = {mask0, 0};
|
||||||
Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
LocalTensor<float> concatLocal = expertForSourceRowLocalFp32;
|
LocalTensor<float> concatLocal = expertForSourceRowLocalFp32;
|
||||||
|
|||||||
@@ -56,9 +56,9 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() {
|
|||||||
LocalTensor<int32_t> expertForSourceRowLocal = inLocal[0];
|
LocalTensor<int32_t> expertForSourceRowLocal = inLocal[0];
|
||||||
LocalTensor<float> expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
|
LocalTensor<float> expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
|
||||||
Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength);
|
Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, this->tileLength);
|
Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, this->tileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
|
||||||
if (duplicateNum > 0) {
|
if (duplicateNum > 0) {
|
||||||
@@ -68,28 +68,28 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() {
|
|||||||
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
|
||||||
uint64_t mask[2] = {mask0, 0};
|
uint64_t mask[2] = {mask0, 0};
|
||||||
Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
LocalTensor<float> concatLocal;
|
LocalTensor<float> concatLocal;
|
||||||
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
||||||
Concat(concatLocal, expertForSourceRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
Concat(concatLocal, expertForSourceRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
|
||||||
LocalTensor<uint32_t> sourceRowLocal;
|
LocalTensor<uint32_t> sourceRowLocal;
|
||||||
sourceRowLocal = inLocal[this->sortNum].ReinterpretCast<uint32_t>();
|
sourceRowLocal = inLocal[this->sortNum].ReinterpretCast<uint32_t>();
|
||||||
Sort<float, true>(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
Sort<float, true>(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
|
LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
|
||||||
LocalTensor<float> sortedExpertForSourceRowLocal = outLocal[0];
|
LocalTensor<float> sortedExpertForSourceRowLocal = outLocal[0];
|
||||||
LocalTensor<uint32_t> expandDstToSrcRowLocal;
|
LocalTensor<uint32_t> expandDstToSrcRowLocal;
|
||||||
expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast<uint32_t>();
|
expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast<uint32_t>();
|
||||||
Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
|
Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength);
|
Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
LocalTensor<int32_t> expertForSourceRowLocalInt32;
|
LocalTensor<int32_t> expertForSourceRowLocalInt32;
|
||||||
expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast<int32_t>();
|
expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast<int32_t>();
|
||||||
|
|||||||
@@ -164,31 +164,31 @@ __aicore__ inline void MoeV2SrcToDstAndGather<T, TilingData>::Compute(int32_t sr
|
|||||||
|
|
||||||
if constexpr (!IsSameType<T, float>::value) {
|
if constexpr (!IsSameType<T, float>::value) {
|
||||||
Cast(inLocal, inLocal.template ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
|
Cast(inLocal, inLocal.template ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (smoothType != 0) {
|
if (smoothType != 0) {
|
||||||
Mul(inLocal, inLocal, smoothLocal, this->cols);
|
Mul(inLocal, inLocal, smoothLocal, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
Abs(tempLocal, inLocal, this->cols);
|
Abs(tempLocal, inLocal, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
|
ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
|
float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;
|
||||||
|
|
||||||
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
|
Duplicate<float>(dynamicQuantLocal, maxValue, 8);
|
||||||
Duplicate<float>(tempLocal, maxValue, this->cols);
|
Duplicate<float>(tempLocal, maxValue, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Div(tempLocal, inLocal, tempLocal, this->cols);
|
Div(tempLocal, inLocal, tempLocal, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
|
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);
|
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);
|
||||||
|
|
||||||
@@ -274,7 +274,7 @@ __aicore__ inline float MoeV2SrcToDstAndGather<T, TilingData>::ComputeMax(LocalT
|
|||||||
|
|
||||||
if constexpr (!IsSameType<T, float>::value) {
|
if constexpr (!IsSameType<T, float>::value) {
|
||||||
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
|
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (smoothType != 0) {
|
if (smoothType != 0) {
|
||||||
@@ -284,11 +284,11 @@ __aicore__ inline float MoeV2SrcToDstAndGather<T, TilingData>::ComputeMax(LocalT
|
|||||||
smoothLocal = smoothInQueue.DeQue<float>();
|
smoothLocal = smoothInQueue.DeQue<float>();
|
||||||
|
|
||||||
Mul(inLocal, inLocal, smoothLocal, colsTileLength);
|
Mul(inLocal, inLocal, smoothLocal, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
Abs(tempLocal, inLocal, colsTileLength);
|
Abs(tempLocal, inLocal, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);
|
ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);
|
||||||
|
|
||||||
@@ -314,13 +314,13 @@ __aicore__ inline void MoeV2SrcToDstAndGather<T, TilingData>::ComputeScale(Local
|
|||||||
inLocal = inputXInQueue.DeQue<float>();
|
inLocal = inputXInQueue.DeQue<float>();
|
||||||
|
|
||||||
Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
|
Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Div(tempLocal, inLocal, tempLocal, colsTileLength);
|
Div(tempLocal, inLocal, tempLocal, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
|
Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);
|
Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);
|
||||||
|
|
||||||
|
|||||||
@@ -75,13 +75,13 @@ __aicore__ inline void MoeV2SrcToDstOp::Compute(int64_t progress) {
|
|||||||
LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
|
LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
|
||||||
LocalTensor<int32_t> assistTensor = assistBuffer.Get<int32_t>(ASSIST_NUM);
|
LocalTensor<int32_t> assistTensor = assistBuffer.Get<int32_t>(ASSIST_NUM);
|
||||||
|
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
int64_t loops = Ceil(currentLoopRows, ASSIST_INDEX_NUM);
|
int64_t loops = Ceil(currentLoopRows, ASSIST_INDEX_NUM);
|
||||||
for (int64_t i = 0; i < loops; i++) {
|
for (int64_t i = 0; i < loops; i++) {
|
||||||
Adds(outLocal[i * ASSIST_NUM], assistTensor,
|
Adds(outLocal[i * ASSIST_NUM], assistTensor,
|
||||||
static_cast<int32_t>(this->perLoopRows * progress + i * ASSIST_INDEX_NUM), ASSIST_NUM);
|
static_cast<int32_t>(this->perLoopRows * progress + i * ASSIST_INDEX_NUM), ASSIST_NUM);
|
||||||
}
|
}
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
copyOutQueue.EnQue<int32_t>(outLocal);
|
copyOutQueue.EnQue<int32_t>(outLocal);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -226,7 +226,7 @@ public:
|
|||||||
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(RECV_SYNC_EVENT_ID);
|
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(RECV_SYNC_EVENT_ID);
|
||||||
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(SEND_SYNC_EVENT_ID);
|
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(SEND_SYNC_EVENT_ID);
|
||||||
AscendC::CrossCoreWaitFlag(SEND_SYNC_EVENT_ID);
|
AscendC::CrossCoreWaitFlag(SEND_SYNC_EVENT_ID);
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
|
|
||||||
ctrBuffer.SetValue(0, epStateValue_);
|
ctrBuffer.SetValue(0, epStateValue_);
|
||||||
AscendC::SetFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
|
AscendC::SetFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
|
||||||
|
|||||||
@@ -264,7 +264,7 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::Init(
|
|||||||
DataCacheCleanAndInvalid<int32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
|
DataCacheCleanAndInvalid<int32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
|
||||||
selfDataStatusTensor[coreIdx_ * UB_ALIGN]);
|
selfDataStatusTensor[coreIdx_ * UB_ALIGN]);
|
||||||
__asm__ __volatile__("");
|
__asm__ __volatile__("");
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
|
|
||||||
workspaceGM_ = workspaceGM;
|
workspaceGM_ = workspaceGM;
|
||||||
expandXGM_.SetGlobalBuffer((__gm__ ExpandXType *)expandX);
|
expandXGM_.SetGlobalBuffer((__gm__ ExpandXType *)expandX);
|
||||||
@@ -480,13 +480,13 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::ReduceScatt
|
|||||||
template <TemplateMC2TypeClass>
|
template <TemplateMC2TypeClass>
|
||||||
__aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::SetWaitTpStatusAndDisPatch()
|
__aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::SetWaitTpStatusAndDisPatch()
|
||||||
{
|
{
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
if (startRankId_ >= epWorldSize_) {
|
if (startRankId_ >= epWorldSize_) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if constexpr (IsNeedReduceScatter) {
|
if constexpr (IsNeedReduceScatter) {
|
||||||
uint32_t tpToRankId = 1 - tpRankId_;
|
uint32_t tpToRankId = 1 - tpRankId_;
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
LocalTensor<float> statusFlagUb = readStateBuf_.Get<float>();
|
LocalTensor<float> statusFlagUb = readStateBuf_.Get<float>();
|
||||||
statusFlagUb(0) = sumTarget_;
|
statusFlagUb(0) = sumTarget_;
|
||||||
SyncFunc<AscendC::HardEvent::S_MTE3>();
|
SyncFunc<AscendC::HardEvent::S_MTE3>();
|
||||||
@@ -604,9 +604,9 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::CustomAdd(L
|
|||||||
if constexpr (AscendC::IsSameType<ExpandXType, bfloat16_t>::value) {
|
if constexpr (AscendC::IsSameType<ExpandXType, bfloat16_t>::value) {
|
||||||
Cast(winTpSendCountFloatTensor_, src0, RoundMode::CAST_NONE, dataCnt);
|
Cast(winTpSendCountFloatTensor_, src0, RoundMode::CAST_NONE, dataCnt);
|
||||||
Cast(gmTpSendCountFloatTensor_, src1, RoundMode::CAST_NONE, dataCnt);
|
Cast(gmTpSendCountFloatTensor_, src1, RoundMode::CAST_NONE, dataCnt);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Add(winTpSendCountFloatTensor_, winTpSendCountFloatTensor_, gmTpSendCountFloatTensor_, dataCnt);
|
Add(winTpSendCountFloatTensor_, winTpSendCountFloatTensor_, gmTpSendCountFloatTensor_, dataCnt);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(dst, winTpSendCountFloatTensor_, RoundMode::CAST_ROUND, dataCnt);
|
Cast(dst, winTpSendCountFloatTensor_, RoundMode::CAST_ROUND, dataCnt);
|
||||||
} else {
|
} else {
|
||||||
Add(dst, src0, src1, dataCnt);
|
Add(dst, src0, src1, dataCnt);
|
||||||
@@ -616,7 +616,7 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::CustomAdd(L
|
|||||||
template <TemplateMC2TypeClass>
|
template <TemplateMC2TypeClass>
|
||||||
__aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::SetStatus()
|
__aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::SetStatus()
|
||||||
{
|
{
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
if (startRankId_ >= epWorldSize_) {
|
if (startRankId_ >= epWorldSize_) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -253,7 +253,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Init(
|
|||||||
DataCacheCleanAndInvalid<int32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
|
DataCacheCleanAndInvalid<int32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
|
||||||
selfDataStatusTensor[aivId_ * UB_ALIGN]);
|
selfDataStatusTensor[aivId_ * UB_ALIGN]);
|
||||||
__asm__ __volatile__("");
|
__asm__ __volatile__("");
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
axisBS_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.bs;
|
axisBS_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.bs;
|
||||||
axisH_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.h;
|
axisH_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.h;
|
||||||
epWorldSize_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.epRankSize;
|
epWorldSize_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.epRankSize;
|
||||||
@@ -568,7 +568,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Allto
|
|||||||
}
|
}
|
||||||
tableLocalTensor_((tokenIndex / axisK_ + 1) * moeExpertRankNumAligned_ + expertId) = 1;
|
tableLocalTensor_((tokenIndex / axisK_ + 1) * moeExpertRankNumAligned_ + expertId) = 1;
|
||||||
}
|
}
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
|
|
||||||
uint32_t sendTokenNum = expertIdsCnt / moeUsedAivNum_;
|
uint32_t sendTokenNum = expertIdsCnt / moeUsedAivNum_;
|
||||||
uint32_t remainderTokenNum = expertIdsCnt % moeUsedAivNum_;
|
uint32_t remainderTokenNum = expertIdsCnt % moeUsedAivNum_;
|
||||||
@@ -587,7 +587,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Allto
|
|||||||
Add(tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_],
|
Add(tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_],
|
||||||
tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_],
|
tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_],
|
||||||
tableInt16LocalTensor_[(row - 1) * moeExpertRankNumInt16Aligned_], moeExpertRankNumInt16Aligned_);
|
tableInt16LocalTensor_[(row - 1) * moeExpertRankNumInt16Aligned_], moeExpertRankNumInt16Aligned_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
|
|
||||||
// row-i of tableLocalTensor_ is index of token
|
// row-i of tableLocalTensor_ is index of token
|
||||||
@@ -655,7 +655,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Allto
|
|||||||
template <TemplateDispatchTypeClass>
|
template <TemplateDispatchTypeClass>
|
||||||
__aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::SetStatus()
|
__aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::SetStatus()
|
||||||
{
|
{
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
SyncAll<true>();
|
SyncAll<true>();
|
||||||
totalExpertNum_ = sharedExpertRankNum_ + moeExpertNum_;
|
totalExpertNum_ = sharedExpertRankNum_ + moeExpertNum_;
|
||||||
sendExpertNum_ = totalExpertNum_ / aivNum_;
|
sendExpertNum_ = totalExpertNum_ / aivNum_;
|
||||||
@@ -695,7 +695,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Quant
|
|||||||
floatLocalTemp = receiveDataCastFloatBuf_.Get<float>();
|
floatLocalTemp = receiveDataCastFloatBuf_.Get<float>();
|
||||||
Cast(floatLocalTemp, xInTensor_, RoundMode::CAST_NONE, axisH_);
|
Cast(floatLocalTemp, xInTensor_, RoundMode::CAST_NONE, axisH_);
|
||||||
xInQueue_.FreeTensor<XType>(xInTensor_);
|
xInQueue_.FreeTensor<XType>(xInTensor_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
if constexpr (IsSmoothScaleExist) {
|
if constexpr (IsSmoothScaleExist) {
|
||||||
if constexpr (DynamicQuant) {
|
if constexpr (DynamicQuant) {
|
||||||
SyncFunc<AscendC::HardEvent::V_MTE2>();
|
SyncFunc<AscendC::HardEvent::V_MTE2>();
|
||||||
@@ -703,28 +703,28 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Quant
|
|||||||
DataCopy(smoothScalesTensor_, scalesGMTensor_[expertIndex * axisH_], axisH_);
|
DataCopy(smoothScalesTensor_, scalesGMTensor_[expertIndex * axisH_], axisH_);
|
||||||
SyncFunc<AscendC::HardEvent::MTE2_V>();
|
SyncFunc<AscendC::HardEvent::MTE2_V>();
|
||||||
Mul(floatLocalTemp, floatLocalTemp, smoothScalesTensor_, axisH_);
|
Mul(floatLocalTemp, floatLocalTemp, smoothScalesTensor_, axisH_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
if constexpr (DynamicQuant) {
|
if constexpr (DynamicQuant) {
|
||||||
LocalTensor<float> floatLocalAbsTemp = smoothScalesBuf_.Get<float>();
|
LocalTensor<float> floatLocalAbsTemp = smoothScalesBuf_.Get<float>();
|
||||||
rowMaxTensor_ = rowMaxBuf_.Get<float>();
|
rowMaxTensor_ = rowMaxBuf_.Get<float>();
|
||||||
Abs(floatLocalAbsTemp, floatLocalTemp, axisH_);
|
Abs(floatLocalAbsTemp, floatLocalTemp, axisH_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
ReduceMax(rowMaxTensor_, floatLocalAbsTemp, floatLocalAbsTemp, axisH_, false);
|
ReduceMax(rowMaxTensor_, floatLocalAbsTemp, floatLocalAbsTemp, axisH_, false);
|
||||||
SyncFunc<AscendC::HardEvent::V_S>();
|
SyncFunc<AscendC::HardEvent::V_S>();
|
||||||
dynamicScale = float(127.0) / rowMaxTensor_.GetValue(0);
|
dynamicScale = float(127.0) / rowMaxTensor_.GetValue(0);
|
||||||
SyncFunc<AscendC::HardEvent::S_V>();
|
SyncFunc<AscendC::HardEvent::S_V>();
|
||||||
Muls(floatLocalTemp, floatLocalTemp, dynamicScale, axisH_);
|
Muls(floatLocalTemp, floatLocalTemp, dynamicScale, axisH_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
LocalTensor<half> halfLocalTemp = floatLocalTemp.ReinterpretCast<half>();
|
LocalTensor<half> halfLocalTemp = floatLocalTemp.ReinterpretCast<half>();
|
||||||
LocalTensor<int32_t> int32LocalTemp = floatLocalTemp.ReinterpretCast<int32_t>();
|
LocalTensor<int32_t> int32LocalTemp = floatLocalTemp.ReinterpretCast<int32_t>();
|
||||||
Cast(int32LocalTemp, floatLocalTemp, RoundMode::CAST_RINT, axisH_);
|
Cast(int32LocalTemp, floatLocalTemp, RoundMode::CAST_RINT, axisH_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
SetDeqScale((half)1.000000e+00f);
|
SetDeqScale((half)1.000000e+00f);
|
||||||
PipeBarrier<PIPE_V>();
|
PipeBarrier<PIPE_V>();
|
||||||
Cast(halfLocalTemp, int32LocalTemp, RoundMode::CAST_ROUND, axisH_);
|
Cast(halfLocalTemp, int32LocalTemp, RoundMode::CAST_ROUND, axisH_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(xOutTensor_, halfLocalTemp, RoundMode::CAST_TRUNC, axisH_);
|
Cast(xOutTensor_, halfLocalTemp, RoundMode::CAST_TRUNC, axisH_);
|
||||||
floatLocalTemp = xOutTensor_.template ReinterpretCast<float>();
|
floatLocalTemp = xOutTensor_.template ReinterpretCast<float>();
|
||||||
floatLocalTemp.SetValue(axisH_ / sizeof(float), float(1.0) / dynamicScale); // int8->float32
|
floatLocalTemp.SetValue(axisH_ / sizeof(float), float(1.0) / dynamicScale); // int8->float32
|
||||||
@@ -742,10 +742,10 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Local
|
|||||||
xQueue_.EnQue(xTmpTensor_);
|
xQueue_.EnQue(xTmpTensor_);
|
||||||
xTmpTensor_ = xQueue_.DeQue<ExpandXOutType>();
|
xTmpTensor_ = xQueue_.DeQue<ExpandXOutType>();
|
||||||
if constexpr (DynamicQuant || StaticQuant) {
|
if constexpr (DynamicQuant || StaticQuant) {
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast<float>();
|
xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast<float>();
|
||||||
dynamicScalesTensor_.SetValue(dynamicScalesLocalIdx++, xOutFp32Tensor_.GetValue(axisH_ / sizeof(float)));
|
dynamicScalesTensor_.SetValue(dynamicScalesLocalIdx++, xOutFp32Tensor_.GetValue(axisH_ / sizeof(float)));
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
}
|
}
|
||||||
if constexpr (IsNeedAllgater) {
|
if constexpr (IsNeedAllgater) {
|
||||||
DataCopy(winTpGatherOutGMTensor_[tokenOffset * axisH_], xTmpTensor_, axisH_);
|
DataCopy(winTpGatherOutGMTensor_[tokenOffset * axisH_], xTmpTensor_, axisH_);
|
||||||
@@ -791,7 +791,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::WaitD
|
|||||||
SyncFunc<AscendC::HardEvent::MTE2_V>();
|
SyncFunc<AscendC::HardEvent::MTE2_V>();
|
||||||
GatherMask(gatherMaskOutTensor, statusFp32Tensor_, gatherTmpTensor, true, mask,
|
GatherMask(gatherMaskOutTensor, statusFp32Tensor_, gatherTmpTensor, true, mask,
|
||||||
{1, (uint16_t)recStatusNumPerCore, 1, 0}, rsvdCnt);
|
{1, (uint16_t)recStatusNumPerCore, 1, 0}, rsvdCnt);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Sum(statusSumOutTensor, gatherMaskOutTensor, sumParams);
|
Sum(statusSumOutTensor, gatherMaskOutTensor, sumParams);
|
||||||
SyncFunc<AscendC::HardEvent::V_S>();
|
SyncFunc<AscendC::HardEvent::V_S>();
|
||||||
sumOfFlag = statusSumOutTensor.GetValue(0);
|
sumOfFlag = statusSumOutTensor.GetValue(0);
|
||||||
@@ -929,11 +929,11 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Local
|
|||||||
xQueue_.EnQue(xTmpTensor_);
|
xQueue_.EnQue(xTmpTensor_);
|
||||||
xTmpTensor_ = xQueue_.DeQue<ExpandXOutType>();
|
xTmpTensor_ = xQueue_.DeQue<ExpandXOutType>();
|
||||||
if constexpr (DynamicQuant || StaticQuant) {
|
if constexpr (DynamicQuant || StaticQuant) {
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast<float>();
|
xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast<float>();
|
||||||
DataCopyPad(dynamicScalesOutGMTensor_[beginIdx + j], xOutFp32Tensor_[axisH_ / sizeof(float)],
|
DataCopyPad(dynamicScalesOutGMTensor_[beginIdx + j], xOutFp32Tensor_[axisH_ / sizeof(float)],
|
||||||
dataCopyParamsFloat);
|
dataCopyParamsFloat);
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
}
|
}
|
||||||
if constexpr (IsNeedAllgater) {
|
if constexpr (IsNeedAllgater) {
|
||||||
DataCopy(winTpGatherOutGMTensor_[(beginIdx + j) * axisHCommu_], xTmpTensor_, axisHCommu_);
|
DataCopy(winTpGatherOutGMTensor_[(beginIdx + j) * axisHCommu_], xTmpTensor_, axisHCommu_);
|
||||||
@@ -963,7 +963,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Local
|
|||||||
template <TemplateDispatchTypeClass>
|
template <TemplateDispatchTypeClass>
|
||||||
__aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::AllGatherSetStatusAndWait()
|
__aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::AllGatherSetStatusAndWait()
|
||||||
{
|
{
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
if (startExpertId_ >= totalExpertNum_) {
|
if (startExpertId_ >= totalExpertNum_) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -187,7 +187,7 @@ private:
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
|
Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
|
for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
|
||||||
for (int32_t j = 0; j < maxLoRARank_; j++) {
|
for (int32_t j = 0; j < maxLoRARank_; j++) {
|
||||||
@@ -219,15 +219,15 @@ private:
|
|||||||
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
|
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
|
||||||
AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
|
AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
|
||||||
Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueY_.FreeTensor(yInLocal);
|
inQueueY_.FreeTensor(yInLocal);
|
||||||
|
|
||||||
Add(yLocal, yLocal, yInLocalFP32, numElements);
|
Add(yLocal, yLocal, yInLocalFP32, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
||||||
Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
|
Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
outQueueY_.EnQue<Y_T>(yOutLocal);
|
outQueueY_.EnQue<Y_T>(yOutLocal);
|
||||||
}
|
}
|
||||||
@@ -243,40 +243,40 @@ private:
|
|||||||
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
|
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
|
||||||
|
|
||||||
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
|
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueW_.FreeTensor(wLocal);
|
inQueueW_.FreeTensor(wLocal);
|
||||||
|
|
||||||
Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
|
Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
if (maxLoRARank_ == LORA_RANK_8) {
|
if (maxLoRARank_ == LORA_RANK_8) {
|
||||||
BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
} else if (maxLoRARank_ == LORA_RANK_16) {
|
} else if (maxLoRARank_ == LORA_RANK_16) {
|
||||||
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
} else if (maxLoRARank_ == LORA_RANK_32) {
|
} else if (maxLoRARank_ == LORA_RANK_32) {
|
||||||
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
|
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
} else if (maxLoRARank_ == LORA_RANK_64) {
|
} else if (maxLoRARank_ == LORA_RANK_64) {
|
||||||
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ private:
|
|||||||
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
|
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
|
||||||
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
||||||
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
|
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueX_.FreeTensor(xLocal);
|
inQueueX_.FreeTensor(xLocal);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -141,20 +141,20 @@ private:
|
|||||||
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
||||||
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueX_.FreeTensor(xLocal);
|
inQueueX_.FreeTensor(xLocal);
|
||||||
inQueueW_.FreeTensor(wLocal);
|
inQueueW_.FreeTensor(wLocal);
|
||||||
} else {
|
} else {
|
||||||
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueW_.FreeTensor(wLocal);
|
inQueueW_.FreeTensor(wLocal);
|
||||||
}
|
}
|
||||||
// dot product of the one tile of X and W
|
// dot product of the one tile of X and W
|
||||||
Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
|
Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
// reduce sum generate one number, which is the summation of all the dot product
|
// reduce sum generate one number, which is the summation of all the dot product
|
||||||
ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
|
ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
acc += wTmpTensor.GetValue(0);
|
acc += wTmpTensor.GetValue(0);
|
||||||
}
|
}
|
||||||
@@ -180,7 +180,7 @@ private:
|
|||||||
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
||||||
|
|
||||||
Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
|
Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
outQueueY_.EnQue<Y_T>(yOutLocal);
|
outQueueY_.EnQue<Y_T>(yOutLocal);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -198,7 +198,7 @@ private:
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
|
Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
|
for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
|
||||||
for (int32_t j = 0; j < maxLoRARank_; j++) {
|
for (int32_t j = 0; j < maxLoRARank_; j++) {
|
||||||
@@ -230,15 +230,15 @@ private:
|
|||||||
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
|
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
|
||||||
AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
|
AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
|
||||||
Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueY_.FreeTensor(yInLocal);
|
inQueueY_.FreeTensor(yInLocal);
|
||||||
|
|
||||||
Add(yLocal, yLocal, yInLocalFP32, numElements);
|
Add(yLocal, yLocal, yInLocalFP32, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
||||||
Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
|
Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
outQueueY_.EnQue<Y_T>(yOutLocal);
|
outQueueY_.EnQue<Y_T>(yOutLocal);
|
||||||
}
|
}
|
||||||
@@ -254,40 +254,40 @@ private:
|
|||||||
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
|
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
|
||||||
|
|
||||||
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
|
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueW_.FreeTensor(wLocal);
|
inQueueW_.FreeTensor(wLocal);
|
||||||
|
|
||||||
Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
|
Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
if (maxLoRARank_ == LORA_RANK_8) {
|
if (maxLoRARank_ == LORA_RANK_8) {
|
||||||
BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
} else if (maxLoRARank_ == LORA_RANK_16) {
|
} else if (maxLoRARank_ == LORA_RANK_16) {
|
||||||
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
} else if (maxLoRARank_ == LORA_RANK_32) {
|
} else if (maxLoRARank_ == LORA_RANK_32) {
|
||||||
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
|
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
} else if (maxLoRARank_ == LORA_RANK_64) {
|
} else if (maxLoRARank_ == LORA_RANK_64) {
|
||||||
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
||||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -94,7 +94,7 @@ private:
|
|||||||
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
|
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
|
||||||
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
||||||
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
|
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueX_.FreeTensor(xLocal);
|
inQueueX_.FreeTensor(xLocal);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -153,20 +153,20 @@ private:
|
|||||||
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
||||||
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueX_.FreeTensor(xLocal);
|
inQueueX_.FreeTensor(xLocal);
|
||||||
inQueueW_.FreeTensor(wLocal);
|
inQueueW_.FreeTensor(wLocal);
|
||||||
} else {
|
} else {
|
||||||
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
inQueueW_.FreeTensor(wLocal);
|
inQueueW_.FreeTensor(wLocal);
|
||||||
}
|
}
|
||||||
// dot product of the one tile of X and W
|
// dot product of the one tile of X and W
|
||||||
Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
|
Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
// reduce sum generate one number, which is the summation of all the dot product
|
// reduce sum generate one number, which is the summation of all the dot product
|
||||||
ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
|
ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
acc += wTmpTensor.GetValue(0);
|
acc += wTmpTensor.GetValue(0);
|
||||||
}
|
}
|
||||||
@@ -192,7 +192,7 @@ private:
|
|||||||
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
||||||
|
|
||||||
Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
|
Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
outQueueY_.EnQue<Y_T>(yOutLocal);
|
outQueueY_.EnQue<Y_T>(yOutLocal);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -159,7 +159,7 @@ private:
|
|||||||
for (int i = copyOffset; i < copyOffset + copyLen; ++i) {
|
for (int i = copyOffset; i < copyOffset + copyLen; ++i) {
|
||||||
CpUB2GM((__gm__ int64_t *)(shareAddrs[i]) + rank * FLAG_UNIT_INT_NUM, inputUB, sizeof(int64_t));
|
CpUB2GM((__gm__ int64_t *)(shareAddrs[i]) + rank * FLAG_UNIT_INT_NUM, inputUB, sizeof(int64_t));
|
||||||
}
|
}
|
||||||
pipe_barrier(PIPE_ALL);
|
AscendC::PipeBarrier<PIPE_ALL>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -458,7 +458,7 @@ __aicore__ inline void SFAVectorService<SFAT>::SoftmaxFlashV2Compute(
|
|||||||
} else {
|
} else {
|
||||||
uint32_t dealRowCountAlign = SFAAlign(dealRowCount, FP32_BLOCK_ELEMENT_NUM);
|
uint32_t dealRowCountAlign = SFAAlign(dealRowCount, FP32_BLOCK_ELEMENT_NUM);
|
||||||
DataCopy(softmaxSumUb[softmaxOutOffset], inSumTensor, dealRowCountAlign);
|
DataCopy(softmaxSumUb[softmaxOutOffset], inSumTensor, dealRowCountAlign);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
DataCopy(softmaxMaxUb[softmaxOutOffset], inMaxTensor, dealRowCountAlign);
|
DataCopy(softmaxMaxUb[softmaxOutOffset], inMaxTensor, dealRowCountAlign);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -477,9 +477,9 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
|
|||||||
LocalTensor<T> nUpdateTmp = nTmp[SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
LocalTensor<T> nUpdateTmp = nTmp[SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
||||||
Muls(nTmp, softmaxMaxUb[softmaxOutOffset], ((T)(-1.0)) * RECIP_OF_LN2, calCount);
|
Muls(nTmp, softmaxMaxUb[softmaxOutOffset], ((T)(-1.0)) * RECIP_OF_LN2, calCount);
|
||||||
|
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(nTmp, nTmp, RoundMode::CAST_ROUND, calCount);
|
Cast(nTmp, nTmp, RoundMode::CAST_ROUND, calCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
uint32_t prOutIdx = (info.loop - 1) % (constInfo.preLoadNum);
|
uint32_t prOutIdx = (info.loop - 1) % (constInfo.preLoadNum);
|
||||||
uint32_t PreSoftmaxOutOffset = prOutIdx * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T) + baseOffset;
|
uint32_t PreSoftmaxOutOffset = prOutIdx * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T) + baseOffset;
|
||||||
@@ -489,10 +489,10 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
|
|||||||
} else {
|
} else {
|
||||||
Sub(nUpdateTmp, nTmp, nValueUb[PreSoftmaxOutOffset], calCount);
|
Sub(nUpdateTmp, nTmp, nValueUb[PreSoftmaxOutOffset], calCount);
|
||||||
}
|
}
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
// update n(i), DataCopy not support when calCount is not align 32B, so use Adds
|
// update n(i), DataCopy not support when calCount is not align 32B, so use Adds
|
||||||
Adds(nValueUb[softmaxOutOffset], nTmp, ConstInfo::FLOAT_ZERO, calCount);
|
Adds(nValueUb[softmaxOutOffset], nTmp, ConstInfo::FLOAT_ZERO, calCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
// update softmax res
|
// update softmax res
|
||||||
LocalTensor<T> nUpdateTmp2 = nTmp[2 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
LocalTensor<T> nUpdateTmp2 = nTmp[2 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
||||||
@@ -500,17 +500,17 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
|
|||||||
LocalTensor<T> tmpCofUb = nTmp[4 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
LocalTensor<T> tmpCofUb = nTmp[4 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
||||||
LocalTensor<T> epsUb = nTmp[5 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
LocalTensor<T> epsUb = nTmp[5 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
||||||
Muls(nUpdateTmp2, softmaxMaxUb[softmaxOutOffset], RECIP_OF_LN2, calCount);
|
Muls(nUpdateTmp2, softmaxMaxUb[softmaxOutOffset], RECIP_OF_LN2, calCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Add(nTmp, nUpdateTmp2, nTmp, calCount);
|
Add(nTmp, nUpdateTmp2, nTmp, calCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(nTmp, nTmp, LN2, calCount);
|
Muls(nTmp, nTmp, LN2, calCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Exp(nTmp, nTmp, calCount);
|
Exp(nTmp, nTmp, calCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(nTmp_KvT, nTmp, RoundMode::CAST_ROUND, calCount); // fp32->fp16/bf16
|
Cast(nTmp_KvT, nTmp, RoundMode::CAST_ROUND, calCount); // fp32->fp16/bf16
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Cast(nUpdateTmp2, nTmp_KvT, RoundMode::CAST_NONE, calCount); // fp16/bf16->fp32
|
Cast(nUpdateTmp2, nTmp_KvT, RoundMode::CAST_NONE, calCount); // fp16/bf16->fp32
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
if (info.s2Idx + 1 == info.curSInnerLoopTimes) {
|
if (info.s2Idx + 1 == info.curSInnerLoopTimes) {
|
||||||
Mul(aMlaSumUb[softmaxOutOffset], softmaxSumUb[softmaxOutOffset], nUpdateTmp2, calCount);
|
Mul(aMlaSumUb[softmaxOutOffset], softmaxSumUb[softmaxOutOffset], nUpdateTmp2, calCount);
|
||||||
}
|
}
|
||||||
@@ -521,33 +521,33 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
|
|||||||
}
|
}
|
||||||
LocalTensor<T> nTmp3 = nTmp[6 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
LocalTensor<T> nTmp3 = nTmp[6 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
|
||||||
Brcb(nTmp3, nUpdateTmp2, (dealRowCount + 7) / 8, {1, 8});
|
Brcb(nTmp3, nUpdateTmp2, (dealRowCount + 7) / 8, {1, 8});
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
RowMuls(mmResUb, mmResUb, nTmp3, dealRowCount, columnCount, actualColumnCount);
|
RowMuls(mmResUb, mmResUb, nTmp3, dealRowCount, columnCount, actualColumnCount);
|
||||||
|
|
||||||
Div(tmpCofUb, nTmp, nUpdateTmp2, calCount); // cof(i)=tmpS32/tmpS16
|
Div(tmpCofUb, nTmp, nUpdateTmp2, calCount); // cof(i)=tmpS32/tmpS16
|
||||||
if (info.isFirstSInnerLoop) {
|
if (info.isFirstSInnerLoop) {
|
||||||
Duplicate(cofValueUb[softmaxOutOffset], (T)1.0, calCount); // cof_0=1
|
Duplicate(cofValueUb[softmaxOutOffset], (T)1.0, calCount); // cof_0=1
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Div(epsUb, cofValueUb[softmaxOutOffset], tmpCofUb, calCount); // 1 / cof(i)
|
Div(epsUb, cofValueUb[softmaxOutOffset], tmpCofUb, calCount); // 1 / cof(i)
|
||||||
} else {
|
} else {
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Div(epsUb, cofValueUb[PreSoftmaxOutOffset], tmpCofUb, calCount); // cof(i - 1) / cof(i)
|
Div(epsUb, cofValueUb[PreSoftmaxOutOffset], tmpCofUb, calCount); // cof(i - 1) / cof(i)
|
||||||
}
|
}
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
Adds(cofValueUb[softmaxOutOffset], tmpCofUb, ConstInfo::FLOAT_ZERO, calCount); // store cof(i)
|
Adds(cofValueUb[softmaxOutOffset], tmpCofUb, ConstInfo::FLOAT_ZERO, calCount); // store cof(i)
|
||||||
Adds(epsUb, epsUb, (T)(-1.0), calCount); // cof(i - 1) / cof(i) - 1
|
Adds(epsUb, epsUb, (T)(-1.0), calCount); // cof(i - 1) / cof(i) - 1
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(epsUb, epsUb, (T)1.5, calCount); // (cof(i - 1) - cof(i)) / cof(i) * 1.5
|
Muls(epsUb, epsUb, (T)1.5, calCount); // (cof(i - 1) - cof(i)) / cof(i) * 1.5
|
||||||
|
|
||||||
Maxs(nUpdateTmp, nUpdateTmp, (T)(-30.0), calCount); // N = max(n(i) - n(i-1), -30)
|
Maxs(nUpdateTmp, nUpdateTmp, (T)(-30.0), calCount); // N = max(n(i) - n(i-1), -30)
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Adds(epsUb, epsUb, (T)(0.000001), calCount);
|
Adds(epsUb, epsUb, (T)(0.000001), calCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Add(nUpdateTmp, nUpdateTmp, epsUb, calCount);
|
Add(nUpdateTmp, nUpdateTmp, epsUb, calCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Muls(nUpdateTmp, nUpdateTmp, FLOAT_E_SCALAR, calCount); // N = N * pow(2, 23)
|
Muls(nUpdateTmp, nUpdateTmp, FLOAT_E_SCALAR, calCount); // N = N * pow(2, 23)
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
// nUpdate int32 out
|
// nUpdate int32 out
|
||||||
LocalTensor<int32_t> tmQue = outputBuff2.Get<int32_t>();
|
LocalTensor<int32_t> tmQue = outputBuff2.Get<int32_t>();
|
||||||
@@ -555,7 +555,7 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
|
|||||||
LocalTensor<int32_t> nInt32Out = tmQue[startRow];
|
LocalTensor<int32_t> nInt32Out = tmQue[startRow];
|
||||||
|
|
||||||
Cast(nInt32Out, nUpdateTmp, RoundMode::CAST_ROUND, dealRowCount);
|
Cast(nInt32Out, nUpdateTmp, RoundMode::CAST_ROUND, dealRowCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
SetFlag<AscendC::HardEvent::MTE3_V>(SYNC_OUTPUT_BUF2_FLAG);
|
SetFlag<AscendC::HardEvent::MTE3_V>(SYNC_OUTPUT_BUF2_FLAG);
|
||||||
}
|
}
|
||||||
@@ -583,18 +583,18 @@ __aicore__ inline void SFAVectorService<SFAT>::DealBmm1ResBaseBlock(
|
|||||||
|
|
||||||
ElewiseCompute(info, mmResUb, dealRowCount, columnCount);
|
ElewiseCompute(info, mmResUb, dealRowCount, columnCount);
|
||||||
|
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<T> tmpAFloorUb = tmpBuff1.Get<T>();
|
LocalTensor<T> tmpAFloorUb = tmpBuff1.Get<T>();
|
||||||
LocalTensor<uint8_t> softmaxTmpUb = tmpAFloorUb.template ReinterpretCast<uint8_t>();
|
LocalTensor<uint8_t> softmaxTmpUb = tmpAFloorUb.template ReinterpretCast<uint8_t>();
|
||||||
|
|
||||||
SoftmaxFlashV2Compute(info, mSplitInfo, mmResUb, softmaxTmpUb, startRow, dealRowCount, columnCount,
|
SoftmaxFlashV2Compute(info, mSplitInfo, mmResUb, softmaxTmpUb, startRow, dealRowCount, columnCount,
|
||||||
info.actualSingleProcessSInnerSize);
|
info.actualSingleProcessSInnerSize);
|
||||||
|
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
AmlaVecCompute(info, mSplitInfo, mmResUb, softmaxTmpUb, startRow, dealRowCount, columnCount,
|
AmlaVecCompute(info, mSplitInfo, mmResUb, softmaxTmpUb, startRow, dealRowCount, columnCount,
|
||||||
info.actualSingleProcessSInnerSize);
|
info.actualSingleProcessSInnerSize);
|
||||||
|
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<KV_T> tmpMMResCastTensor = outputBuff1.Get<KV_T>();
|
LocalTensor<KV_T> tmpMMResCastTensor = outputBuff1.Get<KV_T>();
|
||||||
WaitFlag<AscendC::HardEvent::MTE3_V>(SYNC_OUTPUT_BUF1_FLAG);
|
WaitFlag<AscendC::HardEvent::MTE3_V>(SYNC_OUTPUT_BUF1_FLAG);
|
||||||
|
|
||||||
@@ -1197,20 +1197,20 @@ SFAVectorService<SFAT>::DealBmm2ResBaseBlock(const RunInfo &info, const MSplitIn
|
|||||||
bmm2ResUb.SetSize(vec2ComputeSize);
|
bmm2ResUb.SetSize(vec2ComputeSize);
|
||||||
LocalTensor<T> absBmm2ResUb = bmm2ResUb.template ReinterpretCast<T>();
|
LocalTensor<T> absBmm2ResUb = bmm2ResUb.template ReinterpretCast<T>();
|
||||||
Abs(absBmm2ResUb, tmpBmm2ResUb, vec2ComputeSize);
|
Abs(absBmm2ResUb, tmpBmm2ResUb, vec2ComputeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
LocalTensor<uint8_t> cmpMaskUb = absBmm2ResUb.template ReinterpretCast<uint8_t>();
|
LocalTensor<uint8_t> cmpMaskUb = absBmm2ResUb.template ReinterpretCast<uint8_t>();
|
||||||
CompareScalar(cmpMaskUb, absBmm2ResUb, (T)1e10, CMPMODE::LE, vec2ComputeSize);
|
CompareScalar(cmpMaskUb, absBmm2ResUb, (T)1e10, CMPMODE::LE, vec2ComputeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Select(tmpBmm2ResUb, cmpMaskUb, tmpBmm2ResUb, ConstInfo::FLOAT_ZERO,
|
Select(tmpBmm2ResUb, cmpMaskUb, tmpBmm2ResUb, ConstInfo::FLOAT_ZERO,
|
||||||
SELMODE::VSEL_TENSOR_SCALAR_MODE, vec2ComputeSize);
|
SELMODE::VSEL_TENSOR_SCALAR_MODE, vec2ComputeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
uint32_t baseOffset = mSplitInfo.nBufferStartM / 2 + startRow;
|
uint32_t baseOffset = mSplitInfo.nBufferStartM / 2 + startRow;
|
||||||
uint32_t idx = info.loop % (constInfo.preLoadNum);
|
uint32_t idx = info.loop % (constInfo.preLoadNum);
|
||||||
LocalTensor<T> tmpSumUb = v0ValidSizeBuff.Get<T>()[384];
|
LocalTensor<T> tmpSumUb = v0ValidSizeBuff.Get<T>()[384];
|
||||||
Brcb(tmpSumUb, aMlaSumUb[idx * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T) + baseOffset], (dealRowCount + 7) / 8, {1, 8});
|
Brcb(tmpSumUb, aMlaSumUb[idx * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T) + baseOffset], (dealRowCount + 7) / 8, {1, 8});
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
RowDivs(bmm2ResUb, tmpBmm2ResUb, tmpSumUb, dealRowCount, columnCount, actualColumnCount);
|
RowDivs(bmm2ResUb, tmpBmm2ResUb, tmpSumUb, dealRowCount, columnCount, actualColumnCount);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
SetFlag<AscendC::HardEvent::V_MTE2>(SYNC_INPUT_BUF1_FLAG + pingpongFlag);
|
SetFlag<AscendC::HardEvent::V_MTE2>(SYNC_INPUT_BUF1_FLAG + pingpongFlag);
|
||||||
Bmm2ResCopyOut(info, bmm2ResUb, mStart, dealRowCount, columnCount, actualColumnCount);
|
Bmm2ResCopyOut(info, bmm2ResUb, mStart, dealRowCount, columnCount, actualColumnCount);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -336,7 +336,7 @@ __aicore__ inline void PseSlopeCopyIn(LocalTensor<T> &dstTensor, LocalTensor<hal
|
|||||||
if (pseInfo.needCast) {
|
if (pseInfo.needCast) {
|
||||||
int64_t computeSize = pseInfo.vec1S1RealSize * pseInfo.s2AlignedSize;
|
int64_t computeSize = pseInfo.vec1S1RealSize * pseInfo.s2AlignedSize;
|
||||||
Cast(dstTensor, helpTensor, RoundMode::CAST_NONE, computeSize);
|
Cast(dstTensor, helpTensor, RoundMode::CAST_NONE, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
int64_t s1Offset = pseInfo.s1oIdx * pseInfo.s1BaseSize + pseInfo.vecCoreOffset +
|
int64_t s1Offset = pseInfo.s1oIdx * pseInfo.s1BaseSize + pseInfo.vecCoreOffset +
|
||||||
pseInfo.loopIdx * pseInfo.vec1S1BaseSize;
|
pseInfo.loopIdx * pseInfo.vec1S1BaseSize;
|
||||||
@@ -345,16 +345,16 @@ __aicore__ inline void PseSlopeCopyIn(LocalTensor<T> &dstTensor, LocalTensor<hal
|
|||||||
float posShift = float(s2Offset + pseInfo.kvStartIdx - s1Offset - pseInfo.qStartIdx);
|
float posShift = float(s2Offset + pseInfo.kvStartIdx - s1Offset - pseInfo.qStartIdx);
|
||||||
|
|
||||||
Adds(dstTensor, dstTensor, posShift, computeSize);
|
Adds(dstTensor, dstTensor, posShift, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Abs(dstTensor, dstTensor, computeSize);
|
Abs(dstTensor, dstTensor, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
float slopes = ((__gm__ T *)pseSlope)[offset] * -1;
|
float slopes = ((__gm__ T *)pseSlope)[offset] * -1;
|
||||||
if (pseInfo.pseType == (uint32_t)PseTypeEnum::PSE_INNER_MUL_ADD_SQRT_TYPE) {
|
if (pseInfo.pseType == (uint32_t)PseTypeEnum::PSE_INNER_MUL_ADD_SQRT_TYPE) {
|
||||||
Sqrt(dstTensor, dstTensor, computeSize);
|
Sqrt(dstTensor, dstTensor, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
Muls(dstTensor, dstTensor, slopes, computeSize);
|
Muls(dstTensor, dstTensor, slopes, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -373,7 +373,7 @@ __aicore__ inline void PseSlopeCast(LocalTensor<T> &dstTensor, LocalTensor<half>
|
|||||||
int64_t offset = bOffset + n2Offset + gOffset;
|
int64_t offset = bOffset + n2Offset + gOffset;
|
||||||
int64_t computeSize = pseInfo.vec1S1RealSize * pseInfo.s2AlignedSize;
|
int64_t computeSize = pseInfo.vec1S1RealSize * pseInfo.s2AlignedSize;
|
||||||
Cast(dstTensor, helpTensor, RoundMode::CAST_NONE, computeSize);
|
Cast(dstTensor, helpTensor, RoundMode::CAST_NONE, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
|
|
||||||
int64_t s1Offset = pseInfo.s1oIdx * pseInfo.s1BaseSize + pseInfo.vecCoreOffset +
|
int64_t s1Offset = pseInfo.s1oIdx * pseInfo.s1BaseSize + pseInfo.vecCoreOffset +
|
||||||
pseInfo.loopIdx * pseInfo.vec1S1BaseSize;
|
pseInfo.loopIdx * pseInfo.vec1S1BaseSize;
|
||||||
@@ -382,16 +382,16 @@ __aicore__ inline void PseSlopeCast(LocalTensor<T> &dstTensor, LocalTensor<half>
|
|||||||
float posShift = float(s2Offset + pseInfo.kvStartIdx - s1Offset - pseInfo.qStartIdx);
|
float posShift = float(s2Offset + pseInfo.kvStartIdx - s1Offset - pseInfo.qStartIdx);
|
||||||
|
|
||||||
Adds(dstTensor, dstTensor, posShift, computeSize);
|
Adds(dstTensor, dstTensor, posShift, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
Abs(dstTensor, dstTensor, computeSize);
|
Abs(dstTensor, dstTensor, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
float slopes = ((__gm__ T *)pseSlope)[offset] * -1;
|
float slopes = ((__gm__ T *)pseSlope)[offset] * -1;
|
||||||
if (pseInfo.pseType == (uint32_t)PseTypeEnum::PSE_INNER_MUL_ADD_SQRT_TYPE) {
|
if (pseInfo.pseType == (uint32_t)PseTypeEnum::PSE_INNER_MUL_ADD_SQRT_TYPE) {
|
||||||
Sqrt(dstTensor, dstTensor, computeSize);
|
Sqrt(dstTensor, dstTensor, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
Muls(dstTensor, dstTensor, slopes, computeSize);
|
Muls(dstTensor, dstTensor, slopes, computeSize);
|
||||||
pipe_barrier(PIPE_V);
|
AscendC::PipeBarrier<PIPE_V>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
15
setup.py
15
setup.py
@@ -97,8 +97,10 @@ def get_chip_type() -> str:
|
|||||||
# A3 case
|
# A3 case
|
||||||
assert npu_name
|
assert npu_name
|
||||||
return (chip_name + "_" + npu_name).lower()
|
return (chip_name + "_" + npu_name).lower()
|
||||||
|
elif "950" in chip_name:
|
||||||
|
assert npu_name
|
||||||
|
return (chip_name + "_" + npu_name).lower()
|
||||||
else:
|
else:
|
||||||
# TODO(zzzzwwjj): Currently, A5's chip name has not determined yet.
|
|
||||||
raise ValueError(f"Unable to recognize chip name: {chip_name}, please manually set env SOC_VERSION")
|
raise ValueError(f"Unable to recognize chip name: {chip_name}, please manually set env SOC_VERSION")
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
raise RuntimeError(f"Get chip info failed: {e}")
|
raise RuntimeError(f"Get chip info failed: {e}")
|
||||||
@@ -153,11 +155,14 @@ def gen_build_info():
|
|||||||
"ascend310p3vir02": "_310P",
|
"ascend310p3vir02": "_310P",
|
||||||
"ascend310p3vir04": "_310P",
|
"ascend310p3vir04": "_310P",
|
||||||
"ascend310p3vir08": "_310P",
|
"ascend310p3vir08": "_310P",
|
||||||
"ascend910_9579": "A5",
|
|
||||||
}
|
}
|
||||||
|
if "ascend950" in soc_version:
|
||||||
assert soc_version in soc_to_device, f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend."
|
device_type = "A5"
|
||||||
device_type = soc_to_device[soc_version]
|
else:
|
||||||
|
assert soc_version in soc_to_device, (
|
||||||
|
f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend."
|
||||||
|
)
|
||||||
|
device_type = soc_to_device[soc_version]
|
||||||
|
|
||||||
package_dir = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py")
|
package_dir = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py")
|
||||||
with open(package_dir, "w+") as f:
|
with open(package_dir, "w+") as f:
|
||||||
|
|||||||
@@ -267,7 +267,10 @@ def enable_custom_op():
|
|||||||
|
|
||||||
# There are some customed operators which aren't implemented
|
# There are some customed operators which aren't implemented
|
||||||
# with batch invariant in vllm-ascend, we need to disable them.
|
# with batch invariant in vllm-ascend, we need to disable them.
|
||||||
if vllm_is_batch_invariant():
|
# FIXME(linfeng): Currently custom op compilation and execution are partially available
|
||||||
|
# in ASCEND950 chip, we temporarily disable all custom ops. Please refer to
|
||||||
|
# https://github.com/vllm-project/vllm-ascend/issues/7157 for latest update about custom op.
|
||||||
|
if vllm_is_batch_invariant() or get_ascend_device_type() == AscendDeviceType.A5:
|
||||||
_CUSTOM_OP_ENABLED = False
|
_CUSTOM_OP_ENABLED = False
|
||||||
return _CUSTOM_OP_ENABLED
|
return _CUSTOM_OP_ENABLED
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user