[Build] Add support for Ascend950 chip (#7151)
### What this PR does / why we need it?
This PR adds support for the Ascend950 chip. This includes:
- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize
the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the
Ascend950 hardware target.
- Performing a codebase-wide refactoring of `pipe_barrier()` calls to
the namespaced `AscendC::PipeBarrier<>()` for improved code consistency
and adherence to the latest API standards.
Ascend950DT end-to-end tests passed (Qwen3-32B-MXFP8) and CI passed.
- vLLM version: v0.16.0
- vLLM main commit: 4034c3d32e
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
@@ -264,7 +264,7 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::Init(
|
||||
DataCacheCleanAndInvalid<int32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
|
||||
selfDataStatusTensor[coreIdx_ * UB_ALIGN]);
|
||||
__asm__ __volatile__("");
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
|
||||
workspaceGM_ = workspaceGM;
|
||||
expandXGM_.SetGlobalBuffer((__gm__ ExpandXType *)expandX);
|
||||
@@ -480,13 +480,13 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::ReduceScatt
|
||||
template <TemplateMC2TypeClass>
|
||||
__aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::SetWaitTpStatusAndDisPatch()
|
||||
{
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
if (startRankId_ >= epWorldSize_) {
|
||||
return;
|
||||
}
|
||||
if constexpr (IsNeedReduceScatter) {
|
||||
uint32_t tpToRankId = 1 - tpRankId_;
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
LocalTensor<float> statusFlagUb = readStateBuf_.Get<float>();
|
||||
statusFlagUb(0) = sumTarget_;
|
||||
SyncFunc<AscendC::HardEvent::S_MTE3>();
|
||||
@@ -604,9 +604,9 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::CustomAdd(L
|
||||
if constexpr (AscendC::IsSameType<ExpandXType, bfloat16_t>::value) {
|
||||
Cast(winTpSendCountFloatTensor_, src0, RoundMode::CAST_NONE, dataCnt);
|
||||
Cast(gmTpSendCountFloatTensor_, src1, RoundMode::CAST_NONE, dataCnt);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Add(winTpSendCountFloatTensor_, winTpSendCountFloatTensor_, gmTpSendCountFloatTensor_, dataCnt);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(dst, winTpSendCountFloatTensor_, RoundMode::CAST_ROUND, dataCnt);
|
||||
} else {
|
||||
Add(dst, src0, src1, dataCnt);
|
||||
@@ -616,7 +616,7 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::CustomAdd(L
|
||||
template <TemplateMC2TypeClass>
|
||||
__aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::SetStatus()
|
||||
{
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
if (startRankId_ >= epWorldSize_) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -253,7 +253,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Init(
|
||||
DataCacheCleanAndInvalid<int32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
|
||||
selfDataStatusTensor[aivId_ * UB_ALIGN]);
|
||||
__asm__ __volatile__("");
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
axisBS_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.bs;
|
||||
axisH_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.h;
|
||||
epWorldSize_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.epRankSize;
|
||||
@@ -568,7 +568,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Allto
|
||||
}
|
||||
tableLocalTensor_((tokenIndex / axisK_ + 1) * moeExpertRankNumAligned_ + expertId) = 1;
|
||||
}
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
|
||||
uint32_t sendTokenNum = expertIdsCnt / moeUsedAivNum_;
|
||||
uint32_t remainderTokenNum = expertIdsCnt % moeUsedAivNum_;
|
||||
@@ -587,7 +587,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Allto
|
||||
Add(tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_],
|
||||
tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_],
|
||||
tableInt16LocalTensor_[(row - 1) * moeExpertRankNumInt16Aligned_], moeExpertRankNumInt16Aligned_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
|
||||
// row-i of tableLocalTensor_ is index of token
|
||||
@@ -655,7 +655,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Allto
|
||||
template <TemplateDispatchTypeClass>
|
||||
__aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::SetStatus()
|
||||
{
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
SyncAll<true>();
|
||||
totalExpertNum_ = sharedExpertRankNum_ + moeExpertNum_;
|
||||
sendExpertNum_ = totalExpertNum_ / aivNum_;
|
||||
@@ -695,7 +695,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Quant
|
||||
floatLocalTemp = receiveDataCastFloatBuf_.Get<float>();
|
||||
Cast(floatLocalTemp, xInTensor_, RoundMode::CAST_NONE, axisH_);
|
||||
xInQueue_.FreeTensor<XType>(xInTensor_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
if constexpr (IsSmoothScaleExist) {
|
||||
if constexpr (DynamicQuant) {
|
||||
SyncFunc<AscendC::HardEvent::V_MTE2>();
|
||||
@@ -703,28 +703,28 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Quant
|
||||
DataCopy(smoothScalesTensor_, scalesGMTensor_[expertIndex * axisH_], axisH_);
|
||||
SyncFunc<AscendC::HardEvent::MTE2_V>();
|
||||
Mul(floatLocalTemp, floatLocalTemp, smoothScalesTensor_, axisH_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
if constexpr (DynamicQuant) {
|
||||
LocalTensor<float> floatLocalAbsTemp = smoothScalesBuf_.Get<float>();
|
||||
rowMaxTensor_ = rowMaxBuf_.Get<float>();
|
||||
Abs(floatLocalAbsTemp, floatLocalTemp, axisH_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
ReduceMax(rowMaxTensor_, floatLocalAbsTemp, floatLocalAbsTemp, axisH_, false);
|
||||
SyncFunc<AscendC::HardEvent::V_S>();
|
||||
dynamicScale = float(127.0) / rowMaxTensor_.GetValue(0);
|
||||
SyncFunc<AscendC::HardEvent::S_V>();
|
||||
Muls(floatLocalTemp, floatLocalTemp, dynamicScale, axisH_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
LocalTensor<half> halfLocalTemp = floatLocalTemp.ReinterpretCast<half>();
|
||||
LocalTensor<int32_t> int32LocalTemp = floatLocalTemp.ReinterpretCast<int32_t>();
|
||||
Cast(int32LocalTemp, floatLocalTemp, RoundMode::CAST_RINT, axisH_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
SetDeqScale((half)1.000000e+00f);
|
||||
PipeBarrier<PIPE_V>();
|
||||
Cast(halfLocalTemp, int32LocalTemp, RoundMode::CAST_ROUND, axisH_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Cast(xOutTensor_, halfLocalTemp, RoundMode::CAST_TRUNC, axisH_);
|
||||
floatLocalTemp = xOutTensor_.template ReinterpretCast<float>();
|
||||
floatLocalTemp.SetValue(axisH_ / sizeof(float), float(1.0) / dynamicScale); // int8->float32
|
||||
@@ -742,10 +742,10 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Local
|
||||
xQueue_.EnQue(xTmpTensor_);
|
||||
xTmpTensor_ = xQueue_.DeQue<ExpandXOutType>();
|
||||
if constexpr (DynamicQuant || StaticQuant) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast<float>();
|
||||
dynamicScalesTensor_.SetValue(dynamicScalesLocalIdx++, xOutFp32Tensor_.GetValue(axisH_ / sizeof(float)));
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
}
|
||||
if constexpr (IsNeedAllgater) {
|
||||
DataCopy(winTpGatherOutGMTensor_[tokenOffset * axisH_], xTmpTensor_, axisH_);
|
||||
@@ -791,7 +791,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::WaitD
|
||||
SyncFunc<AscendC::HardEvent::MTE2_V>();
|
||||
GatherMask(gatherMaskOutTensor, statusFp32Tensor_, gatherTmpTensor, true, mask,
|
||||
{1, (uint16_t)recStatusNumPerCore, 1, 0}, rsvdCnt);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
Sum(statusSumOutTensor, gatherMaskOutTensor, sumParams);
|
||||
SyncFunc<AscendC::HardEvent::V_S>();
|
||||
sumOfFlag = statusSumOutTensor.GetValue(0);
|
||||
@@ -929,11 +929,11 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Local
|
||||
xQueue_.EnQue(xTmpTensor_);
|
||||
xTmpTensor_ = xQueue_.DeQue<ExpandXOutType>();
|
||||
if constexpr (DynamicQuant || StaticQuant) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast<float>();
|
||||
DataCopyPad(dynamicScalesOutGMTensor_[beginIdx + j], xOutFp32Tensor_[axisH_ / sizeof(float)],
|
||||
dataCopyParamsFloat);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
}
|
||||
if constexpr (IsNeedAllgater) {
|
||||
DataCopy(winTpGatherOutGMTensor_[(beginIdx + j) * axisHCommu_], xTmpTensor_, axisHCommu_);
|
||||
@@ -963,7 +963,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Local
|
||||
template <TemplateDispatchTypeClass>
|
||||
__aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::AllGatherSetStatusAndWait()
|
||||
{
|
||||
pipe_barrier(PIPE_ALL);
|
||||
AscendC::PipeBarrier<PIPE_ALL>();
|
||||
if (startExpertId_ >= totalExpertNum_) {
|
||||
return;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user