[BugFix][0.18.0] dispatch_ffn_combine kernel: roll back combine, unpermute, and scale parts (#8534)
cherry-pick https://github.com/vllm-project/vllm-ascend/pull/8539 <!-- Thanks for sending a pull request! BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html --> ### What this PR does / why we need it? <!-- - Please clarify what changes you are proposing. The purpose of this section is to outline the changes and how this PR fixes the issue. If possible, please consider writing useful notes for better and faster reviews in your PR. - Please clarify why the changes are needed. For instance, the use case and bug description. - Fixes # --> Due to end-to-end testing , three optimization points for the decode scenario have been reverted in dispatch_ffn_combine kernel. ### Does this PR introduce _any_ user-facing change? <!-- Note that it means *any* user-facing change including all aspects such as API, interface or other behavior changes. Documentation-only updates are not considered user-facing changes. --> ### How was this patch tested? <!-- CI passed with new added/existing test. If it was tested in a way different from regular unit tests, please clarify how you tested step by step, ideally copy and paste-able, so that other reviewers can test and check, and descendants can verify in the future. If tests were not added, please describe why they were not added and/or why it was difficult to add. --> --------- Signed-off-by: l00893928 <liuquanlu@huawei.com> Co-authored-by: l00893928 <liuquanlu@huawei.com>
This commit is contained in:
@@ -224,7 +224,7 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
|
||||
constexpr uint32_t ubStages = 2;
|
||||
|
||||
using EpilogueDispatchPolicy1 = Epilogue::EpilogueAtlasA2PerTokenDequantSwigluQuant<ubStages>;
|
||||
|
||||
|
||||
using ScaleType = Gemm::GemmType<uint64_t, layout::VectorLayout>;
|
||||
using PerTokenScaleType = Gemm::GemmType<float, layout::VectorLayout>;
|
||||
using ElementMulType = Gemm::GemmType<float, layout::RowMajor>;
|
||||
@@ -234,7 +234,8 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
|
||||
using BlockEpilogue1 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy1, CType, PerTokenScaleType,
|
||||
D1Type, TileElemWiseMuls, TileCopy1>;
|
||||
|
||||
using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequantV2<ubStages>;
|
||||
using EpilogueDispatchPolicy2 = Epilogue::EpilogueAtlasA2PerTokenDequant<ubStages>;
|
||||
|
||||
using TileCopy2 = Epilogue::Tile::TileCopy<ArchTag, CType, ScaleType, PerTokenScaleType, D2Type>;
|
||||
using BlockEpilogue2 = Epilogue::Block::BlockEpilogue<EpilogueDispatchPolicy2, CType,PerTokenScaleType,
|
||||
D2Type, TileCopy2>;
|
||||
@@ -254,9 +255,11 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
|
||||
|
||||
GemmCoord problemShape{static_cast<uint32_t>(m), static_cast<uint32_t>(n), static_cast<uint32_t>(k)};
|
||||
|
||||
uint32_t epilogueCoreNum = aivNum / 2;
|
||||
uint32_t epilogueGranularity = expertPerRank - 1;
|
||||
|
||||
uint32_t epilogueCoreNum = aivNum;
|
||||
uint32_t epilogueGranularity = expertPerRank - 3;
|
||||
if (expertPerRank <= 4) {
|
||||
epilogueGranularity = expertPerRank - 1;
|
||||
}
|
||||
typename MatmulKernel::Params params{
|
||||
problemShape, static_cast<uint32_t>(EP), static_cast<uint32_t>(listLen), static_cast<uint32_t>(expertPerRank), static_cast<uint32_t>(maxOutputSize),
|
||||
static_cast<uint32_t>(rank), static_cast<uint32_t>(rankSize),
|
||||
@@ -277,4 +280,4 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
|
||||
}
|
||||
|
||||
} // DispatchFFNCombineImpl
|
||||
#endif // DISPATCH_FFN_COMBINE_H
|
||||
#endif // DISPATCH_FFN_COMBINE_H
|
||||
Reference in New Issue
Block a user