[Build] Add support for Ascend950 chip (#7151)
### What this PR does / why we need it?
This PR adds support for the Ascend950 chip. This includes:
- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize
the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the
Ascend950 hardware target.
- Performing a codebase-wide refactoring that replaces legacy `pipe_barrier()`
calls with the namespaced `AscendC::PipeBarrier<>()` API, for improved code
consistency and adherence to the latest Ascend C API standards.
Ascend950DT end-to-end tests passed (Qwen3-32B-MXFP8) and CI passed.
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
@@ -187,7 +187,7 @@ private:
|
||||
}
|
||||
} else {
|
||||
Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
|
||||
for (int32_t j = 0; j < maxLoRARank_; j++) {
|
||||
@@ -219,15 +219,15 @@ private:
|
||||
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
|
||||
AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
|
||||
Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
inQueueY_.FreeTensor(yInLocal);
|
||||
|
||||
Add(yLocal, yLocal, yInLocalFP32, numElements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
||||
Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
outQueueY_.EnQue<Y_T>(yOutLocal);
|
||||
}
|
||||
@@ -243,40 +243,40 @@ private:
|
||||
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
|
||||
|
||||
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
inQueueW_.FreeTensor(wLocal);
|
||||
|
||||
Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
if (maxLoRARank_ == LORA_RANK_8) {
|
||||
BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
} else if (maxLoRARank_ == LORA_RANK_16) {
|
||||
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
} else if (maxLoRARank_ == LORA_RANK_32) {
|
||||
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
|
||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
} else if (maxLoRARank_ == LORA_RANK_64) {
|
||||
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
|
||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
|
||||
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
|
||||
pipe_barrier(PIPE_V);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user