[Build] Add support for Ascend950 chip (#7151)

### What this PR does / why we need it?
This PR adds support for the Ascend950 chip. This includes:
- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize
the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the
Ascend950 hardware target.
- Performing a codebase-wide refactoring of `pipe_barrier()` calls to
the namespaced `AscendC::PipeBarrier<>()` for improved code consistency
and adherence to the latest API standards.

Ascend950DT end-to-end tests passed (Qwen3-32B-MXFP8), and CI passed.
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
linfeng-yuan
2026-03-12 10:25:51 +08:00
committed by GitHub
parent da01a74009
commit 5f3826b093
24 changed files with 246 additions and 227 deletions

View File

@@ -264,7 +264,7 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::Init(
DataCacheCleanAndInvalid<int32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
selfDataStatusTensor[coreIdx_ * UB_ALIGN]);
__asm__ __volatile__("");
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
workspaceGM_ = workspaceGM;
expandXGM_.SetGlobalBuffer((__gm__ ExpandXType *)expandX);
@@ -480,13 +480,13 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::ReduceScatt
template <TemplateMC2TypeClass>
__aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::SetWaitTpStatusAndDisPatch()
{
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
if (startRankId_ >= epWorldSize_) {
return;
}
if constexpr (IsNeedReduceScatter) {
uint32_t tpToRankId = 1 - tpRankId_;
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
LocalTensor<float> statusFlagUb = readStateBuf_.Get<float>();
statusFlagUb(0) = sumTarget_;
SyncFunc<AscendC::HardEvent::S_MTE3>();
@@ -604,9 +604,9 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::CustomAdd(L
if constexpr (AscendC::IsSameType<ExpandXType, bfloat16_t>::value) {
Cast(winTpSendCountFloatTensor_, src0, RoundMode::CAST_NONE, dataCnt);
Cast(gmTpSendCountFloatTensor_, src1, RoundMode::CAST_NONE, dataCnt);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Add(winTpSendCountFloatTensor_, winTpSendCountFloatTensor_, gmTpSendCountFloatTensor_, dataCnt);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(dst, winTpSendCountFloatTensor_, RoundMode::CAST_ROUND, dataCnt);
} else {
Add(dst, src0, src1, dataCnt);
@@ -616,7 +616,7 @@ __aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::CustomAdd(L
template <TemplateMC2TypeClass>
__aicore__ inline void CamMoeDistributeCombine<TemplateMC2TypeFunc>::SetStatus()
{
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
if (startRankId_ >= epWorldSize_) {
return;
}

View File

@@ -253,7 +253,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Init(
DataCacheCleanAndInvalid<int32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(
selfDataStatusTensor[aivId_ * UB_ALIGN]);
__asm__ __volatile__("");
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
axisBS_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.bs;
axisH_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.h;
epWorldSize_ = tilingData->disGmmDeqSwigluQuantGmmDeqComInfo.epRankSize;
@@ -568,7 +568,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Allto
}
tableLocalTensor_((tokenIndex / axisK_ + 1) * moeExpertRankNumAligned_ + expertId) = 1;
}
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
uint32_t sendTokenNum = expertIdsCnt / moeUsedAivNum_;
uint32_t remainderTokenNum = expertIdsCnt % moeUsedAivNum_;
@@ -587,7 +587,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Allto
Add(tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_],
tableInt16LocalTensor_[row * moeExpertRankNumInt16Aligned_],
tableInt16LocalTensor_[(row - 1) * moeExpertRankNumInt16Aligned_], moeExpertRankNumInt16Aligned_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
// row-i of tableLocalTensor_ is index of token
@@ -655,7 +655,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Allto
template <TemplateDispatchTypeClass>
__aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::SetStatus()
{
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
SyncAll<true>();
totalExpertNum_ = sharedExpertRankNum_ + moeExpertNum_;
sendExpertNum_ = totalExpertNum_ / aivNum_;
@@ -695,7 +695,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Quant
floatLocalTemp = receiveDataCastFloatBuf_.Get<float>();
Cast(floatLocalTemp, xInTensor_, RoundMode::CAST_NONE, axisH_);
xInQueue_.FreeTensor<XType>(xInTensor_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
if constexpr (IsSmoothScaleExist) {
if constexpr (DynamicQuant) {
SyncFunc<AscendC::HardEvent::V_MTE2>();
@@ -703,28 +703,28 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Quant
DataCopy(smoothScalesTensor_, scalesGMTensor_[expertIndex * axisH_], axisH_);
SyncFunc<AscendC::HardEvent::MTE2_V>();
Mul(floatLocalTemp, floatLocalTemp, smoothScalesTensor_, axisH_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
if constexpr (DynamicQuant) {
LocalTensor<float> floatLocalAbsTemp = smoothScalesBuf_.Get<float>();
rowMaxTensor_ = rowMaxBuf_.Get<float>();
Abs(floatLocalAbsTemp, floatLocalTemp, axisH_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
ReduceMax(rowMaxTensor_, floatLocalAbsTemp, floatLocalAbsTemp, axisH_, false);
SyncFunc<AscendC::HardEvent::V_S>();
dynamicScale = float(127.0) / rowMaxTensor_.GetValue(0);
SyncFunc<AscendC::HardEvent::S_V>();
Muls(floatLocalTemp, floatLocalTemp, dynamicScale, axisH_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
}
LocalTensor<half> halfLocalTemp = floatLocalTemp.ReinterpretCast<half>();
LocalTensor<int32_t> int32LocalTemp = floatLocalTemp.ReinterpretCast<int32_t>();
Cast(int32LocalTemp, floatLocalTemp, RoundMode::CAST_RINT, axisH_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
SetDeqScale((half)1.000000e+00f);
PipeBarrier<PIPE_V>();
Cast(halfLocalTemp, int32LocalTemp, RoundMode::CAST_ROUND, axisH_);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(xOutTensor_, halfLocalTemp, RoundMode::CAST_TRUNC, axisH_);
floatLocalTemp = xOutTensor_.template ReinterpretCast<float>();
floatLocalTemp.SetValue(axisH_ / sizeof(float), float(1.0) / dynamicScale); // int8->float32
@@ -742,10 +742,10 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Local
xQueue_.EnQue(xTmpTensor_);
xTmpTensor_ = xQueue_.DeQue<ExpandXOutType>();
if constexpr (DynamicQuant || StaticQuant) {
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast<float>();
dynamicScalesTensor_.SetValue(dynamicScalesLocalIdx++, xOutFp32Tensor_.GetValue(axisH_ / sizeof(float)));
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
}
if constexpr (IsNeedAllgater) {
DataCopy(winTpGatherOutGMTensor_[tokenOffset * axisH_], xTmpTensor_, axisH_);
@@ -791,7 +791,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::WaitD
SyncFunc<AscendC::HardEvent::MTE2_V>();
GatherMask(gatherMaskOutTensor, statusFp32Tensor_, gatherTmpTensor, true, mask,
{1, (uint16_t)recStatusNumPerCore, 1, 0}, rsvdCnt);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Sum(statusSumOutTensor, gatherMaskOutTensor, sumParams);
SyncFunc<AscendC::HardEvent::V_S>();
sumOfFlag = statusSumOutTensor.GetValue(0);
@@ -929,11 +929,11 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Local
xQueue_.EnQue(xTmpTensor_);
xTmpTensor_ = xQueue_.DeQue<ExpandXOutType>();
if constexpr (DynamicQuant || StaticQuant) {
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast<float>();
DataCopyPad(dynamicScalesOutGMTensor_[beginIdx + j], xOutFp32Tensor_[axisH_ / sizeof(float)],
dataCopyParamsFloat);
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
}
if constexpr (IsNeedAllgater) {
DataCopy(winTpGatherOutGMTensor_[(beginIdx + j) * axisHCommu_], xTmpTensor_, axisHCommu_);
@@ -963,7 +963,7 @@ __aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::Local
template <TemplateDispatchTypeClass>
__aicore__ inline void CamMoeDistributeDispatch<TemplateDispatchTypeFunc>::AllGatherSetStatusAndWait()
{
pipe_barrier(PIPE_ALL);
AscendC::PipeBarrier<PIPE_ALL>();
if (startExpertId_ >= totalExpertNum_) {
return;
}