From de4990a5b2d1db1ba6a13c222538e00443d58abd Mon Sep 17 00:00:00 2001 From: Yuhao Yao <37280700+yuhyao@users.noreply.github.com> Date: Thu, 21 Aug 2025 18:45:18 +0800 Subject: [PATCH] [Bug] Fix w4afp8 moe kernel (#9392) --- ...m90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp index 13e890e35..b37d5696c 100644 --- a/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp +++ b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp @@ -1488,6 +1488,10 @@ struct CollectiveMmaArrayMixedInput< template CUTLASS_DEVICE void tensormaps_cp_fence_release(TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B);