fix: resolve sync bug in DispathFFNCombine when expert num per card is 32 (#6416)

### What this PR does / why we need it?

Fix a synchronization deadlock in the DispathFFNCombine module that
occurs on NPU cards when the number of experts per card exceeds 16 (the
bug manifests prominently at 32 and 128).

### Does this PR introduce _any_ user-facing change?
No, this is a bug fix for internal synchronization logic specific to NPU
expert dispatch, with no impact on external APIs, interfaces, or
end-user behaviors.


- vLLM version: v0.14.1
- vLLM main:
dc917cceb8

Signed-off-by: xulei_ict <xulei292@huawei.com>
Co-authored-by: xulei_ict <xulei292@huawei.com>
Author: xulei
Date: 2026-01-30 21:21:20 +08:00
Committed by: GitHub
Parent: 56f5d3bd49
Commit: 77ea873224
4 changed files with 69 additions and 32 deletions


@@ -22,6 +22,8 @@
 namespace Catlass::Gemm::Block {
+constexpr uint16_t CROSS_CORE_FLAG_MAX_SET_COUNT = 15;
 template<AscendC::HardEvent event>
 __aicore__ inline void SyncFlagFunc(int32_t eventID)
 {
@@ -271,7 +273,7 @@ public:
     void Finalize(int32_t target, int32_t flag = 0)
     {
         for(;syncGroupIdx <= target; syncGroupIdx++) {
-            int32_t flagId = syncGroupIdx / 8 + flag;
+            int32_t flagId = syncGroupIdx / CROSS_CORE_FLAG_MAX_SET_COUNT + flag;
             AscendC::CrossCoreSetFlag<0x2, PIPE_FIX>(flagId);
         }
     }