[Build] Add support for Ascend950 chip (#7151)

### What this PR does / why we need it?

This PR adds support for the Ascend950 chip. This includes:

- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the Ascend950 hardware target.
- Performing a codebase-wide refactoring of `pipe_barrier()` calls to the namespaced `AscendC::PipeBarrier<>()` for improved code consistency and adherence to the latest API standards.

Ascend950DT e2e passed (Qwen3-32B-MXFP8) and CI passed.

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2026-03-12 10:25:51 +08:00
parent da01a74009
commit 5f3826b093
24 changed files with 246 additions and 227 deletions
--- a/csrc/utils/inc/kernel/pse.h
+++ b/csrc/utils/inc/kernel/pse.h
@@ -336,7 +336,7 @@ __aicore__ inline void PseSlopeCopyIn(LocalTensor<T> &dstTensor, LocalTensor<hal
        if (pseInfo.needCast) {
            int64_t computeSize = pseInfo.vec1S1RealSize * pseInfo.s2AlignedSize;
            Cast(dstTensor, helpTensor, RoundMode::CAST_NONE, computeSize);
-            pipe_barrier(PIPE_V);
+            AscendC::PipeBarrier<PIPE_V>();

            int64_t s1Offset = pseInfo.s1oIdx * pseInfo.s1BaseSize + pseInfo.vecCoreOffset +
                               pseInfo.loopIdx * pseInfo.vec1S1BaseSize;
@@ -345,16 +345,16 @@ __aicore__ inline void PseSlopeCopyIn(LocalTensor<T> &dstTensor, LocalTensor<hal
            float posShift = float(s2Offset + pseInfo.kvStartIdx - s1Offset - pseInfo.qStartIdx);

            Adds(dstTensor, dstTensor, posShift, computeSize);
-            pipe_barrier(PIPE_V);
+            AscendC::PipeBarrier<PIPE_V>();
            Abs(dstTensor, dstTensor, computeSize);
-            pipe_barrier(PIPE_V);
+            AscendC::PipeBarrier<PIPE_V>();
            float slopes = ((__gm__ T *)pseSlope)[offset] * -1;
            if (pseInfo.pseType == (uint32_t)PseTypeEnum::PSE_INNER_MUL_ADD_SQRT_TYPE) {
                Sqrt(dstTensor, dstTensor, computeSize);
-                pipe_barrier(PIPE_V);
+                AscendC::PipeBarrier<PIPE_V>();
            }
            Muls(dstTensor, dstTensor, slopes, computeSize);
-            pipe_barrier(PIPE_V);
+            AscendC::PipeBarrier<PIPE_V>();
        }
    }
 }
@@ -373,7 +373,7 @@ __aicore__ inline void PseSlopeCast(LocalTensor<T> &dstTensor, LocalTensor<half>
        int64_t offset = bOffset + n2Offset + gOffset;
        int64_t computeSize = pseInfo.vec1S1RealSize * pseInfo.s2AlignedSize;
        Cast(dstTensor, helpTensor, RoundMode::CAST_NONE, computeSize);
-        pipe_barrier(PIPE_V);
+        AscendC::PipeBarrier<PIPE_V>();

        int64_t s1Offset = pseInfo.s1oIdx * pseInfo.s1BaseSize + pseInfo.vecCoreOffset +
                           pseInfo.loopIdx * pseInfo.vec1S1BaseSize;
@@ -382,16 +382,16 @@ __aicore__ inline void PseSlopeCast(LocalTensor<T> &dstTensor, LocalTensor<half>
        float posShift = float(s2Offset + pseInfo.kvStartIdx - s1Offset - pseInfo.qStartIdx);

        Adds(dstTensor, dstTensor, posShift, computeSize);
-        pipe_barrier(PIPE_V);
+        AscendC::PipeBarrier<PIPE_V>();
        Abs(dstTensor, dstTensor, computeSize);
-        pipe_barrier(PIPE_V);
+        AscendC::PipeBarrier<PIPE_V>();
        float slopes = ((__gm__ T *)pseSlope)[offset] * -1;
        if (pseInfo.pseType == (uint32_t)PseTypeEnum::PSE_INNER_MUL_ADD_SQRT_TYPE) {
            Sqrt(dstTensor, dstTensor, computeSize);
-            pipe_barrier(PIPE_V);
+            AscendC::PipeBarrier<PIPE_V>();
        }
        Muls(dstTensor, dstTensor, slopes, computeSize);
-        pipe_barrier(PIPE_V);
+        AscendC::PipeBarrier<PIPE_V>();
    }
 }