[Build] Add support for Ascend950 chip (#7151)

### What this PR does / why we need it?

This PR adds support for the Ascend950 chip. This includes:

- Updating build scripts (`CMakeLists.txt` and `setup.py`) to recognize the Ascend950 chip and set appropriate compilation flags.
- Disabling a set of custom operators that are not yet supported on the Ascend950 hardware target.
- Performing a codebase-wide refactoring of `pipe_barrier()` calls to the namespaced `AscendC::PipeBarrier<>()` for improved code consistency and adherence to the latest API standards.

Ascend950DT e2e passed (Qwen3-32B-MXFP8) and CI passed.

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2026-03-12 10:25:51 +08:00
parent da01a74009
commit 5f3826b093
24 changed files with 246 additions and 227 deletions
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_dynamic_quant.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_dynamic_quant.h
@@ -99,9 +99,9 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
  LocalTensor<int32_t> expertIdxLocal = inLocal[0];
  LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
  Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
  if (duplicateNum > 0) {
    int duplicateIndex = this->totalLength - duplicateNum;
@@ -110,38 +110,38 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
    mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
    uint64_t mask[2] = {mask0, 0};
    Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }
  LocalTensor<float> concatLocal;
  LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
  Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
  LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
  Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
  expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
  LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
  Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
       this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  LocalTensor<int32_t> expandedExpertIdxLocalInt32;
  expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
  Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);

  LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
  LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
  Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  if (duplicateNum > 0) {
    int duplicateIndex = this->totalLength - duplicateNum;
    uint64_t mask0 = UINT64_MAX;
@@ -149,14 +149,14 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::SortCompute() {
    mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
    uint64_t mask[2] = {mask0, 0};
    Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }
  Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
  sortDataCopyInQueue.FreeTensor(inLocal);
 }
@@ -227,31 +227,31 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&

  if constexpr (!IsSameType<T, float>::value) {
    Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  if (smoothType != 0) {
    Mul(inLocal, inLocal, smoothLocal, this->cols_);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  Abs(tempLocal, inLocal, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;

  Duplicate<float>(dynamicQuantLocal, maxValue, 8);
  Duplicate<float>(tempLocal, maxValue, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Div(tempLocal, inLocal, tempLocal, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols_);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols_);

--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant.h
@@ -56,34 +56,34 @@ __aicore__ inline void MoeV2FullLoadQuant<T>::Compute(int64_t xLocalLength) {
  uint32_t elements = Align(this->cols, sizeof(int8_t)) * xLocalLength;
  if constexpr (IsSameType<T, bfloat16_t>::value) {
    Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
  } else if constexpr (IsSameType<T, float>::value) {
    Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
  } else {
    Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
  }
  inputXCopyOutQueue.EnQue(outLocal);
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant_base.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_fullload_quant_base.h
@@ -94,9 +94,9 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
  LocalTensor<int32_t> expertIdxLocal = inLocal[0];
  LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
  Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
  if (duplicateNum > 0) {
    int duplicateIndex = this->totalLength - duplicateNum;
@@ -105,38 +105,38 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
    mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
    uint64_t mask[2] = {mask0, 0};
    Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }
  LocalTensor<float> concatLocal;
  LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
  Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum].template ReinterpretCast<uint32_t>();
  LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
  Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue.AllocTensor<float>();
  LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue.AllocTensor<uint32_t>();
  LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
  Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
       this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  LocalTensor<int32_t> expandedExpertIdxLocalInt32;
  expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
  Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  expandedExpertIdxCopyOutQueue.EnQue<int32_t>(expandedExpertIdxLocalInt32);

  LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue.AllocTensor<uint32_t>();
  LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
  Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  ArithProgression<int32_t>(inLocal[this->sortNum], 0, 1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  if (duplicateNum > 0) {
    int duplicateIndex = this->totalLength - duplicateNum;
    uint64_t mask0 = UINT64_MAX;
@@ -144,14 +144,14 @@ __aicore__ inline void MoeV2FullLoadQuantBase::SortCompute() {
    mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
    uint64_t mask[2] = {mask0, 0};
    Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }
  Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  expandedRowIdxCopyOutQueue.EnQue<uint32_t>(expandedRowIdx);
  sortDataCopyInQueue.FreeTensor(inLocal);

--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h
@@ -122,31 +122,31 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s

  if constexpr (!IsSameType<T, float>::value) {
    Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  if (smoothType != 0) {
    Mul(inLocal, inLocal, smoothLocal, this->cols);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  Abs(tempLocal, inLocal, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;

  Duplicate<float>(dynamicQuantLocal, maxValue, 8);
  Duplicate<float>(tempLocal, maxValue, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Div(tempLocal, inLocal, tempLocal, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);

@@ -285,16 +285,16 @@ __aicore__ inline float MoeV2GatherDynamicQuant<T>::ComputeMax(LocalTensor<float

  if constexpr (!IsSameType<T, float>::value) {
    Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  if (smoothType != 0) {
    Mul(inLocal, inLocal, smoothLocal, colsTileLength);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  Abs(tempLocal, inLocal, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);

@@ -319,13 +319,13 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::ComputeScale(LocalTensor<floa
  inLocal = inputXInQueue.DeQue<float>();

  Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Div(tempLocal, inLocal, tempLocal, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);

--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_quant.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_quant.h
@@ -95,34 +95,34 @@ __aicore__ inline void MoeV2GatherQuant<T>::Compute() {
  uint32_t elements = Align(this->colsTileLength, sizeof(T));
  if constexpr (IsSameType<T, bfloat16_t>::value) {
    Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(halfLocal, floatLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, halfLocal, RoundMode::CAST_RINT, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(halfLocal, intLocal, RoundMode::CAST_RINT, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
  } else if constexpr (IsSameType<T, float>::value) {
    Cast(halfLocal, inLocal, RoundMode::CAST_NONE, elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Muls(halfLocal, halfLocal, static_cast<half>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Adds(halfLocal, halfLocal, static_cast<half>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_RINT, elements);
  } else {
    Muls(inLocal, inLocal, static_cast<T>(this->scale), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Adds(inLocal, inLocal, static_cast<T>(this->offset), elements);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
    Cast(outLocal, inLocal, RoundMode::CAST_RINT, elements);
  }
  inputXCopyOutQueue.EnQue(outLocal);
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_init_routing_fullload.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_init_routing_fullload.h
@@ -88,9 +88,9 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
  LocalTensor<int32_t> expertIdxLocal = inLocal[0];
  LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
  Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
  if (duplicateNum > 0) {
    int duplicateIndex = this->totalLength - duplicateNum;
@@ -99,38 +99,38 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
    mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
    uint64_t mask[2] = {mask0, 0};
    Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }
  LocalTensor<float> concatLocal;
  LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
  Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
  LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum_));
  Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
  LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
  LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
  Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
       this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  LocalTensor<int32_t> expandedExpertIdxLocalInt32;
  expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
  Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);

  LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
  LocalTensor<uint32_t> expandedRowIdxU32 = expandedRowIdx.ReinterpretCast<uint32_t>();
  Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, this->totalLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  if (duplicateNum > 0) {
    int duplicateIndex = this->totalLength - duplicateNum;
    uint64_t mask0 = UINT64_MAX;
@@ -138,14 +138,14 @@ __aicore__ inline void MoeV2FullLoad<T>::SortCompute() {
    mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
    uint64_t mask[2] = {mask0, 0};
    Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }
  Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Extract(tempTensor, expandedRowIdxU32, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
  sortDataCopyInQueue.FreeTensor(inLocal);

--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_mrgsort_out.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_mrgsort_out.h
@@ -168,9 +168,9 @@ __aicore__ inline void MoeV2MrgsortOut::UpdateSortInfo() {

 __aicore__ inline void MoeV2MrgsortOut::Extract() {
  AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM));
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float)));
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float)));
 }

--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_multi_core.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_multi_core.h
@@ -106,9 +106,9 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64

  expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
  Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, sortNum);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, sortNum);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  int64_t duplicateNum = size % ONE_REPEAT_SORT_NUM;
  if (duplicateNum > 0) {
@@ -118,7 +118,7 @@ __aicore__ inline void MoeV2SortMultiCore::UBSortCompute(int64_t progress, int64
    mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
    uint64_t mask[2] = {mask0, 0};
    Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  LocalTensor<float> concatLocal = expertForSourceRowLocalFp32;
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_one_core.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_sort_one_core.h
@@ -56,9 +56,9 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() {
  LocalTensor<int32_t> expertForSourceRowLocal = inLocal[0];
  LocalTensor<float> expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
  Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, this->tileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
  if (duplicateNum > 0) {
@@ -68,28 +68,28 @@ __aicore__ inline void MoeV2SortOneCore::SortCompute() {
    mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
    uint64_t mask[2] = {mask0, 0};
    Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  LocalTensor<float> concatLocal;
  LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
  Concat(concatLocal, expertForSourceRowLocalFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
  LocalTensor<uint32_t> sourceRowLocal;
  sourceRowLocal = inLocal[this->sortNum].ReinterpretCast<uint32_t>();
  Sort<float, true>(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
  LocalTensor<float> sortedExpertForSourceRowLocal = outLocal[0];
  LocalTensor<uint32_t> expandDstToSrcRowLocal;
  expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast<uint32_t>();
  Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  LocalTensor<int32_t> expertForSourceRowLocalInt32;
  expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast<int32_t>();
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_and_gather.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_and_gather.h
@@ -164,31 +164,31 @@ __aicore__ inline void MoeV2SrcToDstAndGather<T, TilingData>::Compute(int32_t sr

  if constexpr (!IsSameType<T, float>::value) {
    Cast(inLocal, inLocal.template ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  if (smoothType != 0) {
    Mul(inLocal, inLocal, smoothLocal, this->cols);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  Abs(tempLocal, inLocal, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  float maxValue = dynamicQuantLocal.GetValue(0) / 127.0f;

  Duplicate<float>(dynamicQuantLocal, maxValue, 8);
  Duplicate<float>(tempLocal, maxValue, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Div(tempLocal, inLocal, tempLocal, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, this->cols);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, this->cols);

@@ -274,7 +274,7 @@ __aicore__ inline float MoeV2SrcToDstAndGather<T, TilingData>::ComputeMax(LocalT

  if constexpr (!IsSameType<T, float>::value) {
    Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  if (smoothType != 0) {
@@ -284,11 +284,11 @@ __aicore__ inline float MoeV2SrcToDstAndGather<T, TilingData>::ComputeMax(LocalT
    smoothLocal = smoothInQueue.DeQue<float>();

    Mul(inLocal, inLocal, smoothLocal, colsTileLength);
-    pipe_barrier(PIPE_V);
+    AscendC::PipeBarrier<PIPE_V>();
  }

  Abs(tempLocal, inLocal, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  ReduceMax(dynamicQuantLocal[8], tempLocal, tempLocal, colsTileLength);

@@ -314,13 +314,13 @@ __aicore__ inline void MoeV2SrcToDstAndGather<T, TilingData>::ComputeScale(Local
  inLocal = inputXInQueue.DeQue<float>();

  Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Div(tempLocal, inLocal, tempLocal, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength);
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();

  Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength);

--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_op.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_op.h
@@ -75,13 +75,13 @@ __aicore__ inline void MoeV2SrcToDstOp::Compute(int64_t progress) {
  LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
  LocalTensor<int32_t> assistTensor = assistBuffer.Get<int32_t>(ASSIST_NUM);

-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  int64_t loops = Ceil(currentLoopRows, ASSIST_INDEX_NUM);
  for (int64_t i = 0; i < loops; i++) {
    Adds(outLocal[i * ASSIST_NUM], assistTensor,
         static_cast<int32_t>(this->perLoopRows * progress + i * ASSIST_INDEX_NUM), ASSIST_NUM);
  }
-  pipe_barrier(PIPE_V);
+  AscendC::PipeBarrier<PIPE_V>();
  copyOutQueue.EnQue<int32_t>(outLocal);
 }

--- a/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp
@@ -226,7 +226,7 @@ public:
        AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(RECV_SYNC_EVENT_ID);
        AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(SEND_SYNC_EVENT_ID);
        AscendC::CrossCoreWaitFlag(SEND_SYNC_EVENT_ID);
-        pipe_barrier(PIPE_ALL);
+        AscendC::PipeBarrier<PIPE_ALL>();
 
        ctrBuffer.SetValue(0, epStateValue_);
        AscendC::SetFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);