[Bug] Fix w4afp8 moe kernel (#9392)

This commit is contained in:
Yuhao Yao
2025-08-21 18:45:18 +08:00
committed by GitHub
parent 029e0af31d
commit de4990a5b2

View File

@@ -1488,6 +1488,10 @@ struct CollectiveMmaArrayMixedInput<
template <class... TMs>
CUTLASS_DEVICE void
tensormaps_cp_fence_release(TensorMapStorage& shared_tensormaps, cute::tuple<TMs...> const& input_tensormaps) {
if (cute::elect_one_sync()) {
cute::tma_desc_commit_group();
cute::tma_desc_wait_group();
}
// Entire warp must do this (i.e. it's aligned)
tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A);
tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B);