[BugFix] dispatch_ffn_combine kernel rollback combinev2 part (#8405)
<!-- Thanks for sending a pull request! BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html --> ### What this PR does / why we need it? <!-- - Please clarify what changes you are proposing. The purpose of this section is to outline the changes and how this PR fixes the issue. If possible, please consider writing useful notes for better and faster reviews in your PR. - Please clarify why the changes are needed. For instance, the use case and bug description. - Fixes # --> ### Does this PR introduce _any_ user-facing change? <!-- Note that it means *any* user-facing change including all aspects such as API, interface or other behavior changes. Documentation-only updates are not considered user-facing changes. --> ### How was this patch tested? <!-- CI passed with new added/existing test. If it was tested in a way different from regular unit tests, please clarify how you tested step by step, ideally copy and paste-able, so that other reviewers can test and check, and descendants can verify in the future. If tests were not added, please describe why they were not added and/or why it was difficult to add. --> Signed-off-by: l00893928 <liuquanlu@huawei.com> Co-authored-by: l00893928 <liuquanlu@huawei.com>
This commit is contained in:
@@ -224,7 +224,7 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
|
||||
constexpr uint32_t ubStages = 2;
|
||||
|
||||
using EpilogueDispatchPolicy1 = Epilogue::EpilogueAtlasA2PerTokenDequantSwigluQuant<ubStages>;
|
||||
|
||||
|
||||
using ScaleType = Gemm::GemmType<uint64_t, layout::VectorLayout>;
|
||||
using PerTokenScaleType = Gemm::GemmType<float, layout::VectorLayout>;
|
||||
using ElementMulType = Gemm::GemmType<float, layout::RowMajor>;
|
||||
@@ -234,7 +234,8 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
|
||||
using BlockEpilogue1 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy1, CType, PerTokenScaleType,
|
||||
D1Type, TileElemWiseMuls, TileCopy1>;
|
||||
|
||||
using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequantV2<ubStages>;
|
||||
using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequant<ubStages>;
|
||||
|
||||
using TileCopy2 = Epilogue::Tile::TileCopy<ArchTag, CType, ScaleType, PerTokenScaleType, D2Type>;
|
||||
using BlockEpilogue2 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy2, CType,PerTokenScaleType,
|
||||
D2Type, TileCopy2>;
|
||||
|
||||
@@ -571,6 +571,7 @@ private:
|
||||
if constexpr (BlockMmad::DispatchPolicy::ASYNC) {
|
||||
blockMmad.SynchronizeBlock();
|
||||
}
|
||||
blockMmad.Finalize(params.expertPerRank - 1, 0);
|
||||
}
|
||||
|
||||
|
||||
@@ -837,20 +838,16 @@ private:
|
||||
|
||||
uint32_t n2 = params.problemShape.k();
|
||||
|
||||
|
||||
typename BlockEpilogue2::Params epilogueParams{
|
||||
static_cast<int32_t>(params.EP),
|
||||
static_cast<int32_t>(params.expertPerRank),
|
||||
static_cast<int32_t>(params.rank),
|
||||
reinterpret_cast<__gm__ int32_t *>(shmem() + peermemInfo.offsetPeerTokenPerExpert),
|
||||
params.layoutD2,
|
||||
static_cast<int32_t>(n2),
|
||||
static_cast<int32_t>(L1TileShape::N),
|
||||
shmem,
|
||||
static_cast<int32_t>(peermemInfo.offsetD)
|
||||
static_cast<int32_t>(n2)
|
||||
};
|
||||
|
||||
uint32_t n = params.problemShape.n();
|
||||
BlockEpilogue2 blockEpilogue2(resource, epilogueParams);
|
||||
|
||||
BlockEpilogue1 blockEpilogue1(resource, n);
|
||||
|
||||
// Synchronous wait: SwiGLU waits for GMM1 [1]
|
||||
@@ -889,16 +886,13 @@ private:
|
||||
}
|
||||
|
||||
blockEpilogue1.Finalize();
|
||||
|
||||
|
||||
CombineSetFlag();
|
||||
|
||||
CombineV2(params, blockEpilogue2);
|
||||
|
||||
BlockEpilogue2 blockEpilogue2(resource, epilogueParams);
|
||||
CombineV1(params, blockEpilogue2);
|
||||
AscendC::SyncAll<true>();
|
||||
#ifndef __CROSSRANKSYNCANDALLGATHERV1__
|
||||
ResetTokenPerExpert(params.EP * AlignUp(params.EP * params.expertPerRank, 128));
|
||||
#endif
|
||||
|
||||
shmem.InitStatusTargetSum();
|
||||
if (get_subblockid() == 0) {
|
||||
AscendC::LocalTensor<int32_t> ctrBuffer = resource.ubBuf.template GetBufferByByte<int32_t>(0);
|
||||
@@ -929,6 +923,49 @@ private:
|
||||
}
|
||||
|
||||
}
|
||||
CATLASS_DEVICE
// Combine phase, V1 path (restored by this rollback): for each local expert
// group, scatter the GMM2 epilogue output rows from this rank's result buffer
// into every destination EP rank's peer shared-memory region.
// Parameters:
//   params        - kernel parameters (EP world size, experts per rank,
//                   layouts, output-size clamp).
//   blockEpilogue - per-token-dequant epilogue that reads gmC2 (plus the
//                   per-token scale when A is int8) and writes the remote
//                   peer buffer.
void CombineV1(Params const &params, BlockEpilogue2 & blockEpilogue) {
    // Row width of the combine output. NOTE(review): taken from
    // problemShape.k(), not n() — presumably GMM2's output width equals the
    // original hidden dim k; confirm against the Params definition.
    uint32_t n2 = params.problemShape.k();
    // Running row offset accumulated over expert groups already combined.
    int32_t prevGroupSum2 = 0;

    icache_preload(8);
    for (uint32_t t_groupIdx = 0; t_groupIdx < params.expertPerRank; ++t_groupIdx) {
        // Several groups share one cross-core flag id
        // (CROSS_CORE_FLAG_MAX_SET_COUNT groups per flag).
        int32_t flagId = t_groupIdx / CROSS_CORE_FLAG_MAX_SET_COUNT;
        // Wait until the producer pipeline has signalled this group's data
        // is ready, then barrier all cores before reading it.
        AscendC::CrossCoreWaitFlag<0x2>(flagId);
        AscendC::SyncAll<true>();

        uint32_t groupIdx = t_groupIdx;

        // Destination EP ranks are strided across AI cores: core `coreIdx`
        // handles ranks coreIdx, coreIdx+coreNum, ...
        for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
            // Peer shared-memory base of rank dstEpIdx, at the D-buffer offset.
            __gm__ void* dstPeermemPtr = shmem(peermemInfo.offsetD, dstEpIdx);
            AscendC::GlobalTensor<ElementD2> gmRemotePeer;
            gmRemotePeer.SetGlobalBuffer(reinterpret_cast<__gm__ ElementD2*>(dstPeermemPtr));
            // Source row in the local result: cumulative token count of all
            // earlier (rank, group) pairs, plus rows consumed by earlier groups.
            uint32_t srcRowOffset = (dstEpIdx == 0 ? 0 : cumsumMM((dstEpIdx - 1) * params.expertPerRank + groupIdx)) + prevGroupSum2;
            // Skip entirely if we are already past the clamped output size.
            if (srcRowOffset < params.maxOutputSize) {
                uint32_t dataRows = tokenPerExpert(tokenPerExpertLayout(dstEpIdx, params.rank, groupIdx));
                // Clamp the tail so we never write past maxOutputSize rows.
                if (srcRowOffset + dataRows > params.maxOutputSize) {
                    dataRows = params.maxOutputSize - srcRowOffset;
                }
                //uint32_t dstRowOffset = preSumBeforeRank(2 * dstEpIdx * FLAGSTRIDE + groupIdx);
                // NOTE(review): tmpBlock is computed but unused — leftover
                // from the commented-out indexing variants above/below;
                // candidate for removal in a follow-up.
                int32_t tmpBlock = AlignUp(params.expertPerRank, FLAGSTRIDE);
                //uint32_t dstRowOffset = preSumBeforeRank(dstEpIdx * tmpBlock + groupIdx);
                // Destination row inside the remote rank's buffer: prefix sum
                // of tokens before this (rank, group) slot.
                uint32_t dstRowOffset = preSumBeforeRank(dstEpIdx * params.expertPerRank + groupIdx);
                MatrixCoord offsetC{srcRowOffset, 0};
                MatrixCoord offsetPeer{dstRowOffset, 0};
                MatrixCoord shapeC{dataRows, n2};
                int64_t gmOffsetC = params.layoutD2.GetOffset(offsetC);
                int64_t gmOffsetPeer = params.layoutD2.GetOffset(offsetPeer);
                // int8 (quantized) input additionally needs the per-token
                // dequant scale; otherwise the epilogue copies/converts only.
                if constexpr (std::is_same_v<ElementA, int8_t>) {
                    blockEpilogue(gmC2[gmOffsetC], shapeC, gmPerTokenScale2[srcRowOffset], gmRemotePeer[gmOffsetPeer]);
                } else {
                    blockEpilogue(gmC2[gmOffsetC], shapeC, gmRemotePeer[gmOffsetPeer]);
                }
            }
        }
        // Advance past all rows of this group (total across all EP ranks).
        prevGroupSum2 += cumsumMM((params.EP - 1) * params.expertPerRank + groupIdx);
    }
    blockEpilogue.Finalize();
}
|
||||
|
||||
CATLASS_DEVICE
|
||||
void CombineV2(Params const ¶ms, BlockEpilogue2 & blockEpilogue) {
|
||||
|
||||
Reference in New Issue
Block a user