[Build] Add support for Ascend950 chip (#7151)

### What this PR does / why we need it?

This PR adds support for the Ascend950 chip. This includes:

- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the Ascend950 hardware target.
- Performing a codebase-wide refactoring of `pipe_barrier()` calls to the namespaced `AscendC::PipeBarrier<>()` for improved code consistency and adherence to the latest API standards.

Ascend950DT e2e passed (Qwen3-32B-MXFP8) and CI passed.

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2026-03-12 10:25:51 +08:00
parent da01a74009
commit 5f3826b093
24 changed files with 246 additions and 227 deletions
--- a/csrc/kernels/sgmv_shrink.cpp
+++ b/csrc/kernels/sgmv_shrink.cpp
@@ -94,7 +94,7 @@ private:
            AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
            AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
            Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
-            pipe_barrier(PIPE_V);
+            AscendC::PipeBarrier<PIPE_V>();
            inQueueX_.FreeTensor(xLocal);
        }

@@ -153,20 +153,20 @@ private:
            AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
            Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
-            pipe_barrier(PIPE_V);
+            AscendC::PipeBarrier<PIPE_V>();
            inQueueX_.FreeTensor(xLocal);
            inQueueW_.FreeTensor(wLocal);
        } else {
            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
-            pipe_barrier(PIPE_V);
+            AscendC::PipeBarrier<PIPE_V>();
            inQueueW_.FreeTensor(wLocal);
        }
        // dot product of the one tile of X and W 
        Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
-        pipe_barrier(PIPE_V);
+        AscendC::PipeBarrier<PIPE_V>();
        // reduce sum generate one number, which is the summation of all the dot product
        ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
-        pipe_barrier(PIPE_V);
+        AscendC::PipeBarrier<PIPE_V>();

        acc += wTmpTensor.GetValue(0);
    }
@@ -192,7 +192,7 @@ private:
        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();

        Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
-        pipe_barrier(PIPE_V);
+        AscendC::PipeBarrier<PIPE_V>();

        outQueueY_.EnQue<Y_T>(yOutLocal);
    }