[Build] Add support for Ascend950 chip (#7151)

### What this PR does / why we need it?
This PR adds support for the Ascend950 chip. This includes:
- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize
the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the
Ascend950 hardware target.
- Performing a codebase-wide refactoring of `pipe_barrier()` calls to
the namespaced `AscendC::PipeBarrier<>()` for improved code consistency
and adherence to the latest API standards.

Ascend950DT end-to-end tests passed (Qwen3-32B-MXFP8), and CI passed.
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
linfeng-yuan
2026-03-12 10:25:51 +08:00
committed by GitHub
parent da01a74009
commit 5f3826b093
24 changed files with 246 additions and 227 deletions

View File

@@ -458,7 +458,7 @@ __aicore__ inline void SFAVectorService<SFAT>::SoftmaxFlashV2Compute(
} else {
uint32_t dealRowCountAlign = SFAAlign(dealRowCount, FP32_BLOCK_ELEMENT_NUM);
DataCopy(softmaxSumUb[softmaxOutOffset], inSumTensor, dealRowCountAlign);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
DataCopy(softmaxMaxUb[softmaxOutOffset], inMaxTensor, dealRowCountAlign);
}
}
@@ -477,9 +477,9 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
LocalTensor<T> nUpdateTmp = nTmp[SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
Muls(nTmp, softmaxMaxUb[softmaxOutOffset], ((T)(-1.0)) * RECIP_OF_LN2, calCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(nTmp, nTmp, RoundMode::CAST_ROUND, calCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
uint32_t prOutIdx = (info.loop - 1) % (constInfo.preLoadNum);
uint32_t PreSoftmaxOutOffset = prOutIdx * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T) + baseOffset;
@@ -489,10 +489,10 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
} else {
Sub(nUpdateTmp, nTmp, nValueUb[PreSoftmaxOutOffset], calCount);
}
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
// update n(i), DataCopy not support when calCount is not align 32B, so use Adds
Adds(nValueUb[softmaxOutOffset], nTmp, ConstInfo::FLOAT_ZERO, calCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
// update softmax res
LocalTensor<T> nUpdateTmp2 = nTmp[2 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
@@ -500,17 +500,17 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
LocalTensor<T> tmpCofUb = nTmp[4 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
LocalTensor<T> epsUb = nTmp[5 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
Muls(nUpdateTmp2, softmaxMaxUb[softmaxOutOffset], RECIP_OF_LN2, calCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Add(nTmp, nUpdateTmp2, nTmp, calCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(nTmp, nTmp, LN2, calCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Exp(nTmp, nTmp, calCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(nTmp_KvT, nTmp, RoundMode::CAST_ROUND, calCount); // fp32->fp16/bf16
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Cast(nUpdateTmp2, nTmp_KvT, RoundMode::CAST_NONE, calCount); // fp16/bf16->fp32
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
if (info.s2Idx + 1 == info.curSInnerLoopTimes) {
Mul(aMlaSumUb[softmaxOutOffset], softmaxSumUb[softmaxOutOffset], nUpdateTmp2, calCount);
}
@@ -521,33 +521,33 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
}
LocalTensor<T> nTmp3 = nTmp[6 * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T)];
Brcb(nTmp3, nUpdateTmp2, (dealRowCount + 7) / 8, {1, 8});
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
RowMuls(mmResUb, mmResUb, nTmp3, dealRowCount, columnCount, actualColumnCount);
Div(tmpCofUb, nTmp, nUpdateTmp2, calCount); // cof(i)=tmpS32/tmpS16
if (info.isFirstSInnerLoop) {
Duplicate(cofValueUb[softmaxOutOffset], (T)1.0, calCount); // cof_0=1
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Div(epsUb, cofValueUb[softmaxOutOffset], tmpCofUb, calCount); // 1 / cof(i)
} else {
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Div(epsUb, cofValueUb[PreSoftmaxOutOffset], tmpCofUb, calCount); // cof(i - 1) / cof(i)
}
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Adds(cofValueUb[softmaxOutOffset], tmpCofUb, ConstInfo::FLOAT_ZERO, calCount); // store cof(i)
Adds(epsUb, epsUb, (T)(-1.0), calCount); // cof(i - 1) / cof(i) - 1
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(epsUb, epsUb, (T)1.5, calCount); // (cof(i - 1) - cof(i)) / cof(i) * 1.5
Maxs(nUpdateTmp, nUpdateTmp, (T)(-30.0), calCount); // N = max(n(i) - n(i-1), -30)
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Adds(epsUb, epsUb, (T)(0.000001), calCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Add(nUpdateTmp, nUpdateTmp, epsUb, calCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Muls(nUpdateTmp, nUpdateTmp, FLOAT_E_SCALAR, calCount); // N = N * pow(2, 23)
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
// nUpdate int32 out
LocalTensor<int32_t> tmQue = outputBuff2.Get<int32_t>();
@@ -555,7 +555,7 @@ __aicore__ inline void SFAVectorService<SFAT>::AmlaVecCompute(
LocalTensor<int32_t> nInt32Out = tmQue[startRow];
Cast(nInt32Out, nUpdateTmp, RoundMode::CAST_ROUND, dealRowCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
SetFlag<AscendC::HardEvent::MTE3_V>(SYNC_OUTPUT_BUF2_FLAG);
}
@@ -583,18 +583,18 @@ __aicore__ inline void SFAVectorService<SFAT>::DealBmm1ResBaseBlock(
ElewiseCompute(info, mmResUb, dealRowCount, columnCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<T> tmpAFloorUb = tmpBuff1.Get<T>();
LocalTensor<uint8_t> softmaxTmpUb = tmpAFloorUb.template ReinterpretCast<uint8_t>();
SoftmaxFlashV2Compute(info, mSplitInfo, mmResUb, softmaxTmpUb, startRow, dealRowCount, columnCount,
info.actualSingleProcessSInnerSize);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
AmlaVecCompute(info, mSplitInfo, mmResUb, softmaxTmpUb, startRow, dealRowCount, columnCount,
info.actualSingleProcessSInnerSize);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<KV_T> tmpMMResCastTensor = outputBuff1.Get<KV_T>();
WaitFlag<AscendC::HardEvent::MTE3_V>(SYNC_OUTPUT_BUF1_FLAG);
@@ -1197,20 +1197,20 @@ SFAVectorService<SFAT>::DealBmm2ResBaseBlock(const RunInfo &info, const MSplitIn
bmm2ResUb.SetSize(vec2ComputeSize);
LocalTensor<T> absBmm2ResUb = bmm2ResUb.template ReinterpretCast<T>();
Abs(absBmm2ResUb, tmpBmm2ResUb, vec2ComputeSize);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
LocalTensor<uint8_t> cmpMaskUb = absBmm2ResUb.template ReinterpretCast<uint8_t>();
CompareScalar(cmpMaskUb, absBmm2ResUb, (T)1e10, CMPMODE::LE, vec2ComputeSize);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
Select(tmpBmm2ResUb, cmpMaskUb, tmpBmm2ResUb, ConstInfo::FLOAT_ZERO,
SELMODE::VSEL_TENSOR_SCALAR_MODE, vec2ComputeSize);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
uint32_t baseOffset = mSplitInfo.nBufferStartM / 2 + startRow;
uint32_t idx = info.loop % (constInfo.preLoadNum);
LocalTensor<T> tmpSumUb = v0ValidSizeBuff.Get<T>()[384];
Brcb(tmpSumUb, aMlaSumUb[idx * SOFTMAX_TMP_BUFFER_OFFSET / sizeof(T) + baseOffset], (dealRowCount + 7) / 8, {1, 8});
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
RowDivs(bmm2ResUb, tmpBmm2ResUb, tmpSumUb, dealRowCount, columnCount, actualColumnCount);
pipe_barrier(PIPE_V);
AscendC::PipeBarrier<PIPE_V>();
SetFlag<AscendC::HardEvent::V_MTE2>(SYNC_INPUT_BUF1_FLAG + pingpongFlag);
Bmm2ResCopyOut(info, bmm2ResUb, mStart, dealRowCount, columnCount, actualColumnCount);
}