[BugFix][0.18.0] dispatch_ffn_combine kernel: roll back the combine, unpermute, and scale parts (#8534)

cherry-pick https://github.com/vllm-project/vllm-ascend/pull/8539

<!--  Thanks for sending a pull request!

BEFORE SUBMITTING, PLEASE READ
https://docs.vllm.ai/en/latest/contributing/overview.html

-->
### What this PR does / why we need it?
<!--
- Please clarify what changes you are proposing. The purpose of this
section is to outline the changes and how this PR fixes the issue.
If possible, please consider writing useful notes for better and faster
reviews in your PR.

- Please clarify why the changes are needed. For instance, the use case
and bug description.

- Fixes #
-->
Based on end-to-end testing results, three optimizations for the decode
scenario have been reverted in the dispatch_ffn_combine kernel.
### Does this PR introduce _any_ user-facing change?
<!--
Note that it means *any* user-facing change including all aspects such
as API, interface or other behavior changes.
Documentation-only updates are not considered user-facing changes.
-->

### How was this patch tested?
<!--
CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please
clarify how you tested step by step, ideally copy and paste-able, so
that other reviewers can test and check, and descendants can verify in
the future.
If tests were not added, please describe why they were not added and/or
why it was difficult to add.
-->

---------

Signed-off-by: l00893928 <liuquanlu@huawei.com>
Co-authored-by: l00893928 <liuquanlu@huawei.com>
This commit is contained in:
LQLlulu
2026-04-22 23:27:02 +08:00
committed by GitHub
parent 69a57bc9ec
commit fcf4d477a7
6 changed files with 180 additions and 136 deletions

View File

@@ -234,7 +234,8 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
using BlockEpilogue1 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy1, CType, PerTokenScaleType, using BlockEpilogue1 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy1, CType, PerTokenScaleType,
D1Type, TileElemWiseMuls, TileCopy1>; D1Type, TileElemWiseMuls, TileCopy1>;
using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequantV2<ubStages>; using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequant<ubStages>;
using TileCopy2 = Epilogue::Tile::TileCopy<ArchTag, CType, ScaleType, PerTokenScaleType, D2Type>; using TileCopy2 = Epilogue::Tile::TileCopy<ArchTag, CType, ScaleType, PerTokenScaleType, D2Type>;
using BlockEpilogue2 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy2, CType,PerTokenScaleType, using BlockEpilogue2 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy2, CType,PerTokenScaleType,
D2Type, TileCopy2>; D2Type, TileCopy2>;
@@ -254,9 +255,11 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
GemmCoord problemShape{static_cast<uint32_t>(m), static_cast<uint32_t>(n), static_cast<uint32_t>(k)}; GemmCoord problemShape{static_cast<uint32_t>(m), static_cast<uint32_t>(n), static_cast<uint32_t>(k)};
uint32_t epilogueCoreNum = aivNum / 2; uint32_t epilogueCoreNum = aivNum;
uint32_t epilogueGranularity = expertPerRank - 1; uint32_t epilogueGranularity = expertPerRank - 3;
if (expertPerRank <= 4) {
epilogueGranularity = expertPerRank - 1;
}
typename MatmulKernel::Params params{ typename MatmulKernel::Params params{
problemShape, static_cast<uint32_t>(EP), static_cast<uint32_t>(listLen), static_cast<uint32_t>(expertPerRank), static_cast<uint32_t>(maxOutputSize), problemShape, static_cast<uint32_t>(EP), static_cast<uint32_t>(listLen), static_cast<uint32_t>(expertPerRank), static_cast<uint32_t>(maxOutputSize),
static_cast<uint32_t>(rank), static_cast<uint32_t>(rankSize), static_cast<uint32_t>(rank), static_cast<uint32_t>(rankSize),

View File

@@ -571,6 +571,7 @@ private:
if constexpr (BlockMmad::DispatchPolicy::ASYNC) { if constexpr (BlockMmad::DispatchPolicy::ASYNC) {
blockMmad.SynchronizeBlock(); blockMmad.SynchronizeBlock();
} }
blockMmad.Finalize(params.expertPerRank - 1, 0);
} }
@@ -727,19 +728,6 @@ private:
} }
CATLASS_DEVICE
void CombineSetFlag() {
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID0);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID1);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID2);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID3);
AscendC::SetFlag<AscendC::HardEvent::S_MTE2>(EVENT_ID2);
AscendC::SetFlag<AscendC::HardEvent::S_MTE2>(EVENT_ID3);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID0);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID1);
}
CATLASS_DEVICE CATLASS_DEVICE
void DispatchAndCombine(Params const &params) { void DispatchAndCombine(Params const &params) {
icache_preload(8); icache_preload(8);
@@ -800,13 +788,17 @@ private:
GM_ADDR otherRankPtr = shmem(0, dstEpIdx); GM_ADDR otherRankPtr = shmem(0, dstEpIdx);
AscendC::GlobalTensor<ElementA> gmRemoteA; AscendC::GlobalTensor<ElementA> gmRemoteA;
gmRemoteA.SetGlobalBuffer(reinterpret_cast<__gm__ ElementA*>(otherRankPtr + peermemInfo.offsetA)); gmRemoteA.SetGlobalBuffer(reinterpret_cast<__gm__ ElementA*>(otherRankPtr + peermemInfo.offsetA));
AscendC::GlobalTensor<ElementPerTokenScale> gmRemotePerTokenScale;
gmRemotePerTokenScale.SetGlobalBuffer(reinterpret_cast<__gm__ ElementPerTokenScale*>(otherRankPtr + peermemInfo.offsetPeerPerTokenScale));
MatrixCoord offsetA{rowStart, 0}; MatrixCoord offsetA{rowStart, 0};
MatrixCoord offsetPeer{rowSrc, 0}; MatrixCoord offsetPeer{rowSrc, 0};
int64_t gmOffsetA = params.layoutA.GetOffset(offsetA); int64_t gmOffsetA = params.layoutA.GetOffset(offsetA);
int64_t gmOffsetPeer = rowSrc * (params.problemShape.k() + ALIGN_512); int64_t gmOffsetPeer = params.layoutA.GetOffset(offsetPeer);
// Communication data // Communication data
CopyGMToGMPerToken(gmA[gmOffsetA], gmPerTokenScale1[rowStart], gmRemoteA[gmOffsetPeer], rows, params.problemShape.k()); CopyGMToGM(gmA[gmOffsetA], gmRemoteA[gmOffsetPeer], rows * params.problemShape.k(), params.ubMoveNum);
// Communication scale
CopyGMToGM(gmPerTokenScale1[rowStart], gmRemotePerTokenScale[rowSrc], rows, rows);
} }
} }
@@ -837,16 +829,12 @@ private:
uint32_t n2 = params.problemShape.k(); uint32_t n2 = params.problemShape.k();
typename BlockEpilogue2::Params epilogueParams{ typename BlockEpilogue2::Params epilogueParams{
static_cast<int32_t>(params.EP), static_cast<int32_t>(params.EP),
static_cast<int32_t>(params.expertPerRank), static_cast<int32_t>(params.expertPerRank),
static_cast<int32_t>(params.rank),
reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert), reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert),
params.layoutD2, static_cast<int32_t>(n2)
static_cast<int32_t>(n2),
static_cast<int32_t>(L1TileShape::N),
shmem,
static_cast<int32_t>(peermemInfo.offsetD)
}; };
uint32_t n = params.problemShape.n(); uint32_t n = params.problemShape.n();
@@ -890,109 +878,65 @@ private:
blockEpilogue1.Finalize(); blockEpilogue1.Finalize();
blockEpilogue2.SetFlag();
CombineSetFlag(); CombineV1(params, blockEpilogue2);
CombineV2(params, blockEpilogue2);
AscendC::SyncAll<true>(); AscendC::SyncAll<true>();
#ifndef __CROSSRANKSYNCANDALLGATHERV1__ #ifndef __CROSSRANKSYNCANDALLGATHERV1__
ResetTokenPerExpert(params.EP * AlignUp(params.EP * params.expertPerRank, 128)); ResetTokenPerExpert(params.EP * AlignUp(params.EP * params.expertPerRank, 128));
#endif #endif
shmem.InitStatusTargetSum();
if (get_subblockid() == 0) { shmem.CrossRankSync();
AscendC::LocalTensor<int32_t> ctrBuffer = resource.ubBuf.template GetBufferByByte<int32_t>(0);
shmem.CrossRankSyncV2Set(ctrBuffer);
} else {
uint32_t uboffset = 0;
uint32_t aicCoreNum = coreNum / 2;
uint32_t aicCoreIdx = get_block_idx();
uint32_t sendRankNum_ = params.EP / aicCoreNum;
uint32_t remainderRankNum = params.EP % aicCoreNum;
if (aicCoreIdx < remainderRankNum) {
sendRankNum_++;
}
AscendC::LocalTensor<float> statusTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
uboffset += sendRankNum_ * UB_ALIGN;
AscendC::LocalTensor<float> gatherMaskOutTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
uboffset += AlignUp(params.EP * sizeof(float), 32);
AscendC::LocalTensor<uint32_t> gatherTmpTensor = resource.ubBuf.template GetBufferByByte<uint32_t>(uboffset);
uboffset += AlignUp(sizeof(uint32_t), 32);
AscendC::LocalTensor<float> statusSumOutTensor = resource.ubBuf.template GetBufferByByte<float>(uboffset);
uboffset += AlignUp(sizeof(float), 32);
shmem.CrossRankSyncV2Wait(statusTensor, gatherMaskOutTensor, gatherTmpTensor, statusSumOutTensor);
MoeTokenUnpermuteTilingData tilingData; MoeTokenUnpermuteTilingData tilingData;
MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum / 2); MoeTokenUnpermuteTiling(params.problemShape.m() * params.topK, n2, params.topK, tilingData, coreNum);
KernelMoeTokenUnpermute<ElementD2, int32_t, float, true> kernelMoeTokenUnpermuteOp; KernelMoeTokenUnpermute<ElementD2, int32_t, float, true> kernelMoeTokenUnpermuteOp;
kernelMoeTokenUnpermuteOp.Init(shmem() + peermemInfo.offsetD, workspaceInfo.expandedRowIdx, params.probs, reinterpret_cast<GM_ADDR>(params.ptrOutput), &tilingData); kernelMoeTokenUnpermuteOp.Init(shmem() + peermemInfo.offsetD, workspaceInfo.expandedRowIdx, params.probs, reinterpret_cast<GM_ADDR>(params.ptrOutput), &tilingData);
kernelMoeTokenUnpermuteOp.Process(); kernelMoeTokenUnpermuteOp.Process();
} }
}
CATLASS_DEVICE CATLASS_DEVICE
void CombineV2(Params const &params, BlockEpilogue2 & blockEpilogue) { void CombineV1(Params const &params, BlockEpilogue2 & blockEpilogue) {
BlockScheduler blockScheduler;
int32_t syncLoopIdx = 0;
uint32_t startCoreIdx = 0;
uint32_t aicCoreNum = coreNum / 2;
uint32_t aicCoreIdx = get_block_idx();
uint32_t aivSubCoreIdx = get_subblockid();
uint32_t preSrcExpertSum = 0;
uint32_t n2 = params.problemShape.k(); uint32_t n2 = params.problemShape.k();
uint32_t k2 = params.problemShape.n() / 2; int32_t prevGroupSum2 = 0;
icache_preload(8); icache_preload(8);
for (uint32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) { for (uint32_t t_groupIdx = 0; t_groupIdx < params.expertPerRank; ++t_groupIdx) {
uint32_t currentExpertM = cumsumMM((params.EP - 1) * params.expertPerRank + groupIdx); int32_t flagId = t_groupIdx / CROSS_CORE_FLAG_MAX_SET_COUNT;
if (preSrcExpertSum >= params.maxOutputSize) { AscendC::CrossCoreWaitFlag<0x2>(flagId);
currentExpertM = 0; AscendC::SyncAll<true>();
} else if (preSrcExpertSum + currentExpertM > params.maxOutputSize) {
currentExpertM = params.maxOutputSize - preSrcExpertSum;
}
GemmCoord inGroupProblemShape{currentExpertM, n2, k2}; // M N K
blockScheduler.Update(inGroupProblemShape, MakeCoord(L1TileShape::M, L1TileShape::N));
uint32_t coreLoops = blockScheduler.GetCoreLoops();
uint32_t startLoopIdx = ((aicCoreIdx < startCoreIdx) ? (aicCoreIdx + aicCoreNum) : aicCoreIdx) - startCoreIdx;
for (uint32_t loopIdx = startLoopIdx; loopIdx < coreLoops; loopIdx += aicCoreNum) { uint32_t groupIdx = t_groupIdx;
GemmCoord blockCoord = blockScheduler.GetBlockCoord(loopIdx);
GemmCoord actualBlockShape = blockScheduler.GetActualBlockShape(blockCoord);
int32_t m0 = 16;
// Block count, the shape of each block is (m0, actualBlockShape.n())
int32_t m_rows = (actualBlockShape.m() + m0 - 1) / m0;
int32_t aiv_m_rows = m_rows / 2;
if (aivSubCoreIdx == 1 && aiv_m_rows * 2 < m_rows) {
aiv_m_rows += 1;
}
uint32_t m_offset = blockCoord.m() * L1TileShape::M;//blockOffset
if(aivSubCoreIdx == 1) {
m_offset += (m_rows / 2) * m0;
}
for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
for (;syncLoopIdx <= groupIdx; syncLoopIdx ++) { __gm__ void* dstPeermemPtr = shmem(peermemInfo.offsetD, dstEpIdx);
int32_t flag_id = syncLoopIdx / CROSS_CORE_FLAG_MAX_SET_COUNT; AscendC::GlobalTensor<ElementD2> gmRemotePeer;
AscendC::CrossCoreWaitFlag<0x2>(flag_id); gmRemotePeer.SetGlobalBuffer(reinterpret_cast<__gm__ ElementD2*>(dstPeermemPtr));
uint32_t srcRowOffset = (dstEpIdx == 0 ? 0 : cumsumMM((dstEpIdx - 1) * params.expertPerRank + groupIdx)) + prevGroupSum2;
if (srcRowOffset < params.maxOutputSize) {
uint32_t dataRows = tokenPerExpert(tokenPerExpertLayout(dstEpIdx, params.rank, groupIdx));
if (srcRowOffset + dataRows > params.maxOutputSize) {
dataRows = params.maxOutputSize - srcRowOffset;
} }
//uint32_t dstRowOffset = preSumBeforeRank(2 * dstEpIdx * FLAGSTRIDE + groupIdx);
for (int32_t cur_row = 0; cur_row < aiv_m_rows; cur_row ++) { int32_t tmpBlock = AlignUp(params.expertPerRank, FLAGSTRIDE);
GemmCoord realTileCoord{m_offset, blockCoord.n() * L1TileShape::N, 1}; //uint32_t dstRowOffset = preSumBeforeRank(dstEpIdx * tmpBlock + groupIdx);
uint32_t actualm = m0; uint32_t dstRowOffset = preSumBeforeRank(dstEpIdx * params.expertPerRank + groupIdx);
if(aivSubCoreIdx == 1 && cur_row == aiv_m_rows - 1){ MatrixCoord offsetC{srcRowOffset, 0};
actualm = actualBlockShape.m() - (m_rows / 2) * m0 - cur_row * m0; MatrixCoord offsetPeer{dstRowOffset, 0};
} MatrixCoord shapeC{dataRows, n2};
GemmCoord realTileShape{actualm, actualBlockShape.n(), 1}; int64_t gmOffsetC = params.layoutD2.GetOffset(offsetC);
blockEpilogue(gmC2, gmPerTokenScale2, realTileCoord, realTileShape, groupIdx, preSrcExpertSum, preSumBeforeRank); int64_t gmOffsetPeer = params.layoutD2.GetOffset(offsetPeer);
m_offset += m0; if constexpr (std::is_same_v<ElementA, int8_t>) {
blockEpilogue(gmC2[gmOffsetC], shapeC, gmPerTokenScale2[srcRowOffset], gmRemotePeer[gmOffsetPeer]);
} else {
blockEpilogue(gmC2[gmOffsetC], shapeC, gmRemotePeer[gmOffsetPeer]);
} }
} }
preSrcExpertSum += currentExpertM; }
startCoreIdx = (startCoreIdx + coreLoops) % aicCoreNum; prevGroupSum2 += cumsumMM((params.EP - 1) * params.expertPerRank + groupIdx);
} }
blockEpilogue.Finalize(); blockEpilogue.Finalize();
} }
private: private:
struct WorkspaceInfo { struct WorkspaceInfo {
GM_ADDR ptrA; GM_ADDR ptrA;

View File

@@ -35,6 +35,7 @@ class MoeV2FullLoadDynamicQuant : public MoeV2SortBase {
__aicore__ inline void CopyOutIdx(); __aicore__ inline void CopyOutIdx();
__aicore__ inline void CopyOutEmpty(); __aicore__ inline void CopyOutEmpty();
__aicore__ inline void CopyOutXQuant1H(); __aicore__ inline void CopyOutXQuant1H();
__aicore__ inline void CopyOutXQuantEH();
__aicore__ inline void ComputeExpertTokenCountOrCumsum(); __aicore__ inline void ComputeExpertTokenCountOrCumsum();
__aicore__ inline void Compute(LocalTensor<float>& smoothLocal); __aicore__ inline void Compute(LocalTensor<float>& smoothLocal);
@@ -48,7 +49,6 @@ class MoeV2FullLoadDynamicQuant : public MoeV2SortBase {
int64_t k_; int64_t k_;
int64_t n_; int64_t n_;
int64_t cols_; int64_t cols_;
int64_t cols_scale_;
int64_t activateRows_; int64_t activateRows_;
int64_t expertNum; int64_t expertNum;
int64_t expertCapacity; int64_t expertCapacity;
@@ -63,10 +63,12 @@ class MoeV2FullLoadDynamicQuant : public MoeV2SortBase {
TQue<QuePosition::VECIN, 1> smoothInQueue; TQue<QuePosition::VECIN, 1> smoothInQueue;
TQue<QuePosition::VECOUT, 1> calcQueue; TQue<QuePosition::VECOUT, 1> calcQueue;
TQue<QuePosition::VECOUT, 1> inputXOutQueue; TQue<QuePosition::VECOUT, 1> inputXOutQueue;
TQue<QuePosition::VECOUT, 1> scaleOutQueue;
GlobalTensor<T> xGm_; GlobalTensor<T> xGm_;
GlobalTensor<int32_t> expertIdxGm_; GlobalTensor<int32_t> expertIdxGm_;
GlobalTensor<float> quantSmoothGm; GlobalTensor<float> quantSmoothGm;
GlobalTensor<float> dynamicQuantScaleGm;
GlobalTensor<int8_t> expandedXGm_; GlobalTensor<int8_t> expandedXGm_;
GlobalTensor<int32_t> expandedRowIdxGm_; GlobalTensor<int32_t> expandedRowIdxGm_;
@@ -223,7 +225,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&
LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>(); LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>(); LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();
LocalTensor<float> dynamicQuantLocal = outLocal[this->cols_].template ReinterpretCast<float>(); LocalTensor<float> dynamicQuantLocal = scaleOutQueue.AllocTensor<float>();
if constexpr (!IsSameType<T, float>::value) { if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_); Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign], RoundMode::CAST_NONE, this->cols_);
@@ -257,6 +259,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Compute(LocalTensor<float>&
calcQueue.FreeTensor(tempLocal); calcQueue.FreeTensor(tempLocal);
inputXOutQueue.EnQue(outLocal); inputXOutQueue.EnQue(outLocal);
scaleOutQueue.EnQue(dynamicQuantLocal);
} }
template <typename T> template <typename T>
@@ -272,7 +275,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuant1H() {
DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0}; DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0}; DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
DataCopyExtParams intriParams{1, static_cast<uint32_t>((this->cols_ + BLOCK_BYTES) * sizeof(int8_t)), 0, 0, 0}; DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
LocalTensor<float> smoothLocal; LocalTensor<float> smoothLocal;
if (smoothType == 1) { if (smoothType == 1) {
@@ -292,6 +295,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuant1H() {
xCopyInQueue_.EnQue<T>(xLocal); xCopyInQueue_.EnQue<T>(xLocal);
Compute(smoothLocal); Compute(smoothLocal);
LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>(); LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>();
while (curRowsStart <= curRowsEnd && curRowsStart / this->k_ == row) { while (curRowsStart <= curRowsEnd && curRowsStart / this->k_ == row) {
int32_t outIndex = expandedRowIdx.GetValue(curRowsStart); int32_t outIndex = expandedRowIdx.GetValue(curRowsStart);
@@ -299,15 +303,74 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuant1H() {
if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows_)) { if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows_)) {
continue; continue;
} }
DataCopyPad(expandedXGm_[outIndex * this->cols_scale_], outLocal, intriParams); DataCopyPad(expandedXGm_[outIndex * cols_], outLocal, intriParams);
DataCopyPad(dynamicQuantScaleGm[outIndex], quantScaleLocal, {1, 4, 0, 0, 0});
} }
xCopyInQueue_.FreeTensor(xLocal); xCopyInQueue_.FreeTensor(xLocal);
inputXOutQueue.FreeTensor(outLocal); inputXOutQueue.FreeTensor(outLocal);
scaleOutQueue.FreeTensor(quantScaleLocal);
}
if (smoothType == 1) {
smoothInQueue.FreeTensor(smoothLocal);
} }
expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx); expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
} }
template <typename T>
__aicore__ inline void MoeV2FullLoadDynamicQuant<T>::CopyOutXQuantEH() {
LocalTensor<int32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.DeQue<int32_t>();
expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
Muls(expandDstToSrcRowLocal.ReinterpretCast<float>(), expandDstToSrcRowLocal.ReinterpretCast<float>(), (float)-1,
this->totalLength);
pipe_barrier(PIPE_V);
LocalTensor<int32_t> sortedRowIdx = expandDstToSrcRowLocal.ReinterpretCast<int32_t>();
Cast(sortedRowIdx, expandDstToSrcRowLocal.ReinterpretCast<float>(), RoundMode::CAST_ROUND, this->totalLength);
int64_t curRowsStart = this->blockIdx_ * this->perCoreRows_;
int64_t curRowsEnd = curRowsStart + this->coreRows_ - 1;
DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
for (int64_t row = curRowsStart; row <= curRowsEnd; row++) {
if (this->dropPadMode == DROPLESS_MODE && row >= this->activateRows_) {
break;
}
int32_t srcIdx = sortedRowIdx.GetValue(row);
int32_t expertIdx = expandedExpertIdxLocal.GetValue(row);
LocalTensor<T> inLocal = xCopyInQueue_.AllocTensor<T>();
LocalTensor<float> smoothLocal = smoothInQueue.AllocTensor<float>();
if constexpr (IsSameType<T, float>::value) {
DataCopyPad(inLocal, xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
} else {
DataCopyPad(inLocal[colsAlign], xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
}
DataCopyPad(smoothLocal, quantSmoothGm[expertIdx * this->cols_], smoothCopyParams, {false, 0, 0, 0});
xCopyInQueue_.EnQue<T>(inLocal);
smoothInQueue.EnQue(smoothLocal);
smoothLocal = smoothInQueue.DeQue<float>();
Compute(smoothLocal);
LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
DataCopyPad(dynamicQuantScaleGm[row], quantScaleLocal, {1, 4, 0, 0, 0});
LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>();
DataCopyPad(expandedXGm_[row * this->cols_], outLocal, intriParams);
xCopyInQueue_.FreeTensor(inLocal);
smoothInQueue.FreeTensor(smoothLocal);
inputXOutQueue.FreeTensor(outLocal);
scaleOutQueue.FreeTensor(quantScaleLocal);
}
expandDstToSrcRowQueue_.FreeTensor(expandDstToSrcRowLocal);
expandedExpertIdxCopyOutQueue_.FreeTensor(expandedExpertIdxLocal);
}
template <typename T> template <typename T>
__aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR expandedX, __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR expandedX,
GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum,
@@ -321,7 +384,6 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR exp
this->k_ = tilingData->k; this->k_ = tilingData->k;
this->n_ = tilingData->n; this->n_ = tilingData->n;
this->cols_ = tilingData->cols; this->cols_ = tilingData->cols;
this->cols_scale_ = this->cols_ + ALIGN_512;
this->needCoreNum_ = this->gatherOutTilingData_->needCoreNum; this->needCoreNum_ = this->gatherOutTilingData_->needCoreNum;
this->perCoreRows_ = this->gatherOutTilingData_->perCoreRows; this->perCoreRows_ = this->gatherOutTilingData_->perCoreRows;
this->activateRows_ = this->gatherOutTilingData_->activateRows; this->activateRows_ = this->gatherOutTilingData_->activateRows;
@@ -352,6 +414,7 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR exp
Align(this->expertNum, sizeof(int32_t))); Align(this->expertNum, sizeof(int32_t)));
} }
quantSmoothGm.SetGlobalBuffer((__gm__ float*)quantSmooth); quantSmoothGm.SetGlobalBuffer((__gm__ float*)quantSmooth);
dynamicQuantScaleGm.SetGlobalBuffer((__gm__ float*)dynamicQuantScale);
int64_t kvFactor = 2; int64_t kvFactor = 2;
int64_t buffSize = this->sortNum_ * sizeof(int32_t); int64_t buffSize = this->sortNum_ * sizeof(int32_t);
@@ -375,7 +438,8 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Init(GM_ADDR x, GM_ADDR exp
} }
pipe->InitBuffer(smoothInQueue, 1, AlignBytes(this->cols_, sizeof(float))); pipe->InitBuffer(smoothInQueue, 1, AlignBytes(this->cols_, sizeof(float)));
pipe->InitBuffer(calcQueue, 1, AlignBytes(this->cols_, sizeof(float))); pipe->InitBuffer(calcQueue, 1, AlignBytes(this->cols_, sizeof(float)));
pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->cols_scale_, sizeof(int8_t))); pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->cols_, sizeof(int8_t)));
pipe->InitBuffer(scaleOutQueue, 1, BLOCK_BYTES + BLOCK_BYTES);
} }
template <typename T> template <typename T>
@@ -391,8 +455,12 @@ __aicore__ inline void MoeV2FullLoadDynamicQuant<T>::Process() {
} else { } else {
CopyOutEmpty(); CopyOutEmpty();
} }
if (smoothType == 2) {
CopyOutXQuantEH();
} else {
CopyOutXQuant1H(); CopyOutXQuant1H();
} }
}
} }
} // namespace MoeInitRoutingQuantV2 } // namespace MoeInitRoutingQuantV2
#endif // MOE_V2_DYNAMIC_QUANT_FULL_LOAD_H #endif // MOE_V2_DYNAMIC_QUANT_FULL_LOAD_H

View File

@@ -66,7 +66,6 @@ class MoeV2GatherDynamicQuant {
int64_t needCoreNum; int64_t needCoreNum;
int64_t blockIdx; int64_t blockIdx;
int64_t cols; int64_t cols;
int64_t cols_scale_;
int64_t n; int64_t n;
int64_t k; int64_t k;
int64_t totalLength; int64_t totalLength;
@@ -118,7 +117,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s
LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>(); LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>(); LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();
LocalTensor<float> dynamicQuantLocal = outLocal[this->cols].template ReinterpretCast<float>(); LocalTensor<float> dynamicQuantLocal = scaleOutQueue.AllocTensor<float>();
if constexpr (!IsSameType<T, float>::value) { if constexpr (!IsSameType<T, float>::value) {
Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols); Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
@@ -152,6 +151,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Compute(LocalTensor<float>& s
calcQueue.FreeTensor(tempLocal); calcQueue.FreeTensor(tempLocal);
inputXOutQueue.EnQue(outLocal); inputXOutQueue.EnQue(outLocal);
scaleOutQueue.EnQue(dynamicQuantLocal);
} }
template <typename T> template <typename T>
@@ -163,7 +163,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr
int64_t currentLoopStartRow = initialRow / this->k; int64_t currentLoopStartRow = initialRow / this->k;
int64_t currentLoopLastRow = (initialRow + this->currentLoopRows - 1) / this->k; int64_t currentLoopLastRow = (initialRow + this->currentLoopRows - 1) / this->k;
DataCopyExtParams copyInParams{1, static_cast<uint32_t>(this->cols * sizeof(T)), 0, 0, 0}; DataCopyExtParams copyInParams{1, static_cast<uint32_t>(this->cols * sizeof(T)), 0, 0, 0};
DataCopyExtParams copyOutParams{1, static_cast<uint32_t>((this->cols + BLOCK_BYTES) * sizeof(int8_t)), 0, 0, 0}; DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(this->cols * sizeof(int8_t)), 0, 0, 0};
DataCopyExtParams smoothParams{1, static_cast<uint32_t>(this->cols * sizeof(float)), 0, 0, 0}; DataCopyExtParams smoothParams{1, static_cast<uint32_t>(this->cols * sizeof(float)), 0, 0, 0};
LocalTensor<float> smoothLocal; LocalTensor<float> smoothLocal;
@@ -187,6 +187,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr
// Compute quantization // Compute quantization
Compute(smoothLocal); Compute(smoothLocal);
LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>(); LocalTensor<int8_t> outLocal = inputXOutQueue.DeQue<int8_t>();
while (curLoopRow < this->currentLoopRows && initialRow / this->k == row) { while (curLoopRow < this->currentLoopRows && initialRow / this->k == row) {
@@ -196,11 +197,15 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr
if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows)) { if (outIndex == -1 || (this->dropPadMode == DROPLESS_MODE && outIndex >= this->activateRows)) {
continue; continue;
} }
// Scale is placed after the data position DataCopyPad(expandedXGm[outIndex * cols], outLocal, copyOutParams);
DataCopyPad(expandedXGm[outIndex * cols_scale_], outLocal, copyOutParams); DataCopyPad(dynamicQuantScaleGm[outIndex], quantScaleLocal, {1, 4, 0, 0, 0});
} }
inputXInQueue.FreeTensor(inLocal); inputXInQueue.FreeTensor(inLocal);
inputXOutQueue.FreeTensor(outLocal); inputXOutQueue.FreeTensor(outLocal);
scaleOutQueue.FreeTensor(quantScaleLocal);
}
if (smoothType == 1) {
smoothInQueue.FreeTensor(smoothLocal);
} }
expandRowIdxInQueue.FreeTensor(indicesLocal); expandRowIdxInQueue.FreeTensor(indicesLocal);
} }
@@ -458,7 +463,6 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Init(GM_ADDR inputX, GM_ADDR
this->needCoreNum = this->gatherOutTilingData->needCoreNum; this->needCoreNum = this->gatherOutTilingData->needCoreNum;
this->activateRows = this->gatherOutTilingData->activateRows; this->activateRows = this->gatherOutTilingData->activateRows;
this->cols = tilingData->cols; this->cols = tilingData->cols;
this->cols_scale_ = this->cols + ALIGN_512;
this->n = tilingData->n; this->n = tilingData->n;
this->k = tilingData->k; this->k = tilingData->k;
this->totalLength = tilingData->n * tilingData->k; this->totalLength = tilingData->n * tilingData->k;
@@ -514,15 +518,33 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Init(GM_ADDR inputX, GM_ADDR
pipe->InitBuffer(smoothInQueue, BUFFER_NUM, AlignBytes(this->perLoopCols, sizeof(float))); pipe->InitBuffer(smoothInQueue, BUFFER_NUM, AlignBytes(this->perLoopCols, sizeof(float)));
pipe->InitBuffer(calcQueue, 1, AlignBytes(this->perLoopCols, sizeof(float))); pipe->InitBuffer(calcQueue, 1, AlignBytes(this->perLoopCols, sizeof(float)));
pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->perLoopCols, sizeof(int8_t))); pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->perLoopCols, sizeof(int8_t)));
pipe->InitBuffer(scaleOutQueue, 1, BLOCK_BYTES + BLOCK_BYTES);
} }
template <typename T> template <typename T>
__aicore__ inline void MoeV2GatherDynamicQuant<T>::Process() { __aicore__ inline void MoeV2GatherDynamicQuant<T>::Process() {
if (this->blockIdx < this->needCoreNum) { if (this->blockIdx < this->needCoreNum) {
currentLoopRows = perLoopRows; currentLoopRows = perLoopRows;
if (colLoops > 1) { // Cannot fit all data in one row, workspace is required
trap(); // Not supported if (colLoops > 1) { // A single row cannot be fully loaded; workspace is required
} else { // All data can fit in one row if (smoothType == 2) {
for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
CopyInExpandedExpertIdx(loop);
CopyOutPartialXQuantEH(loop);
}
currentLoopRows = lastLoopRows;
CopyInExpandedExpertIdx(this->rowLoops - 1);
CopyOutPartialXQuantEH(this->rowLoops - 1);
} else {
for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
CopyInExpandedRowIdx(loop);
CopyOutPartialXQuant1H(loop);
}
currentLoopRows = lastLoopRows;
CopyInExpandedRowIdx(this->rowLoops - 1);
CopyOutPartialXQuant1H(this->rowLoops - 1);
}
} else { // A single row can be fully loaded
if (smoothType == 2) { if (smoothType == 2) {
for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) { for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
CopyInExpandedExpertIdx(loop); CopyInExpandedExpertIdx(loop);

View File

@@ -85,9 +85,8 @@ KernelMoeTokenUnpermute<T1, T2, T3, PROBS>::Init(GM_ADDR permuted_tokens, GM_ADD
GM_ADDR unpermuted_tokens, GM_ADDR unpermuted_tokens,
const MoeTokenUnpermuteTilingData *__restrict tiling_data) const MoeTokenUnpermuteTilingData *__restrict tiling_data)
{ {
this->blockIdx = get_block_idx(); this->blockIdx = get_block_idx() + get_subblockid() * get_block_num();
this->blockNum = get_block_num(); this->blockNum = get_block_num() * get_subblockdim();
if (blockIdx >= blockNum) { if (blockIdx >= blockNum) {
return; return;
} }

View File

@@ -99,12 +99,20 @@ public:
eventUbDMTE3VList[i] = eventMTE3V++; eventUbDMTE3VList[i] = eventMTE3V++;
eventUbDVMTE3List[i] = eventVMTE3++; eventUbDVMTE3List[i] = eventVMTE3++;
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[i]);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[i]);
ubCFp32List[i] = resource.ubBuf.template GetBufferByByte<float>(ubOffset); ubCFp32List[i] = resource.ubBuf.template GetBufferByByte<float>(ubOffset);
ubOffset += blockN * sizeof(float); ubOffset += blockN * sizeof(float);
} }
} }
CATLASS_DEVICE
void SetFlag()
{
for (uint32_t i = 0; i < UB_STAGES; ++i) {
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[i]);
AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[i]);
}
}
CATLASS_DEVICE CATLASS_DEVICE
void Finalize() void Finalize()
{ {