Rename files in sgl kernel to avoid nested folder structure (#4213)

Co-authored-by: zhyncs <me@zhyncs.com>
2025-03-08 22:54:51 -08:00
parent ee132a4515
commit 8abf74e3c9
47 changed files with 184 additions and 199 deletions
--- a/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
@@ -0,0 +1,125 @@
+// Adapt from
+// https://github.com/vllm-project/vllm/blob/v0.7.1/csrc/cutlass_extensions/gemm/collective/collective_buildler.hpp
+// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl
+// clang-format off
+#pragma once
+
+#include <cutlass/gemm/collective/builders/sm90_gmma_builder.inl>
+#include "cutlass_extensions/gemm/dispatch_policy.hpp"
+#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_SS (BlockScaled Builders)
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  int ScaleGranularityM
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>,
+    cute::enable_if_t<
+      not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
+> {
+  using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>;
+
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+
+  static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
+                                                                   KernelPtrArrayTmaWarpSpecializedCooperative,
+                                                                   KernelPtrArrayTmaWarpSpecializedPingpong>);
+  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
+  static_assert((!IsFP8Input || !IsArrayOfPointersGemm),
+                "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now.");
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
+
+  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
+                                                          KernelTmaWarpSpecializedCooperative,
+                                                          KernelPtrArrayTmaWarpSpecializedCooperative,
+                                                          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>;
+  using AtomLayoutMNK = cute::conditional_t<IsCooperative,
+      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
+
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
+      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
+      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
+  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
+      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType, ScaleGranularityM>;
+
+  using SmemCopyAtomA = void;
+  using SmemCopyAtomB = void;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutATag>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@@ -0,0 +1,733 @@
+// clang-format off
+// Adapt from https://github.com/vllm-project/vllm/blob/v0.7.1/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+ #pragma once
+
+ #include "cutlass/cutlass.h"
+ #include "cutlass/gemm/dispatch_policy.hpp"
+ #include "cutlass/trace.h"
+ #include "cutlass/numeric_types.h"
+
+ #include "cute/arch/cluster_sm90.hpp"
+ #include "cute/arch/copy_sm80.hpp"
+ #include "cute/arch/copy_sm90.hpp"
+ #include "cute/algorithm/functional.hpp"
+ #include "cute/atom/mma_atom.hpp"
+ #include "cute/algorithm/gemm.hpp"
+ #include "cute/tensor_predicate.hpp"
+ #include "cute/numeric/arithmetic_tuple.hpp"
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////
+
+ namespace cutlass::gemm::collective {
+ using namespace cute;
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////
+
+ // WarpSpecialized Mainloop
+ template <
+   int Stages,
+   class ClusterShape,
+   class KernelSchedule,
+   int ScaleGranularityM_,
+   class TileShape_,
+   class ElementA_,
+   class StrideA_,
+   class ElementB_,
+   class StrideB_,
+   class TiledMma_,
+   class GmemTiledCopyA_,
+   class SmemLayoutAtomA_,
+   class SmemCopyAtomA_,
+   class TransformA_,
+   class GmemTiledCopyB_,
+   class SmemLayoutAtomB_,
+   class SmemCopyAtomB_,
+   class TransformB_>
+ struct CollectiveMma<
+     MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>,
+     TileShape_,
+     ElementA_,
+     StrideA_,
+     ElementB_,
+     StrideB_,
+     TiledMma_,
+     GmemTiledCopyA_,
+     SmemLayoutAtomA_,
+     SmemCopyAtomA_,
+     TransformA_,
+     GmemTiledCopyB_,
+     SmemLayoutAtomB_,
+     SmemCopyAtomB_,
+     TransformB_>
+ {
+   //
+   // Type Aliases
+   //
+   using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>;
+   using TileShape = TileShape_;
+   using ElementA = ElementA_;
+   using StrideA = StrideA_;
+   using ElementB = ElementB_;
+   using StrideB = StrideB_;
+   using TiledMma = TiledMma_;
+   using ElementAccumulator = typename TiledMma::ValTypeC;
+   using ElementBlockScale = ElementAccumulator;
+   using GmemTiledCopyA = GmemTiledCopyA_;
+   using GmemTiledCopyB = GmemTiledCopyB_;
+   using SmemLayoutAtomA = SmemLayoutAtomA_;
+   using SmemLayoutAtomB = SmemLayoutAtomB_;
+   using SmemCopyAtomA = SmemCopyAtomA_;
+   using SmemCopyAtomB = SmemCopyAtomB_;
+   using TransformA = TransformA_;
+   using TransformB = TransformB_;
+   using ArchTag = typename DispatchPolicy::ArchTag;
+
+   using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+   using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+   using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+   using PipelineParams = typename MainloopPipeline::Params;
+
+   // Two threads per CTA are producers (1 for operand tile and 32 for scales)
+   static constexpr int NumProducerThreadEvents = 33;
+
+   static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_;
+   static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
+
+   static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+   static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+   static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+   static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+   static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+   static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+   static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M.");
+
+   // Tile along modes in a way that maximizes the TMA box size.
+   using SmemLayoutA = decltype(tile_to_shape(
+       SmemLayoutAtomA{},
+       make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+       cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+   using SmemLayoutB = decltype(tile_to_shape(
+       SmemLayoutAtomB{},
+       make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+       cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+
+   // Block scaling gmem-to-smem copy atom
+   using SmemBlockScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
+   using SmemBlockScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
+
+   // Block scaling smem layout
+   using SmemLayoutScaleA = Layout<Shape<Int<ScaleMsPerTile>, Int<DispatchPolicy::Stages>>>;
+   using SmemLayoutScaleB = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>; // `ScaleNsPerTile` is always 1.
+
+   static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
+   static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                 cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                 "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+   static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+       "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+   static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+       "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+   static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
+              "ElementAccumulator and ElementBlockScale should be same datatype");
+
+   struct SharedStorage
+   {
+     struct TensorStorage : cute::aligned_struct<128> {
+       cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // mxk
+       cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // nxk
+       cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleA>> smem_scale_A; // ScaleMsPerTile x k
+       cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleB>> smem_scale_B; // 1xk
+     } tensors;
+
+     using PipelineStorage = typename MainloopPipeline::SharedStorage;
+     PipelineStorage pipeline;
+   };
+   using TensorStorage = typename SharedStorage::TensorStorage;
+   using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+   // Host side kernel arguments
+   struct Arguments {
+     ElementA const* ptr_A;
+     StrideA dA;
+     ElementB const* ptr_B;
+     StrideB dB;
+     uint32_t mma_promotion_interval = 4;
+     ElementBlockScale const* ptr_scale_A;
+     ElementBlockScale const* ptr_scale_B;
+   };
+
+   // Device side kernel params
+   struct Params {
+     // Assumption: StrideA is congruent with Problem_MK
+     using TMA_A = decltype(make_tma_copy_A_sm90(
+         GmemTiledCopyA{},
+         make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
+         SmemLayoutA{}(_,_,0),
+         TileShape{},
+         ClusterShape{}));
+     // Assumption: StrideB is congruent with Problem_NK
+     using TMA_B = decltype(make_tma_copy_B_sm90(
+         GmemTiledCopyB{},
+         make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
+         SmemLayoutB{}(_,_,0),
+         TileShape{},
+         ClusterShape{}));
+     TMA_A tma_load_a;
+     TMA_B tma_load_b;
+     uint32_t tma_transaction_bytes = TmaTransactionBytes;
+     uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
+     uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
+     uint32_t mma_promotion_interval = 4;
+     // Block scaling factors for A and B
+     ElementBlockScale const* ptr_scale_A;
+     ElementBlockScale const* ptr_scale_B;
+   };
+
+   //
+   // Methods
+   //
+
+   template <class ProblemShape>
+   static constexpr Params
+   to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+     (void) workspace;
+
+     // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
+     auto problem_shape_MNKL = append<4>(problem_shape, 1);
+     auto [M,N,K,L] = problem_shape_MNKL;
+
+     auto ptr_A = reinterpret_cast<ElementA const*>(args.ptr_A);
+     auto ptr_B = reinterpret_cast<ElementB const*>(args.ptr_B);
+
+     Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA));
+     Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
+     typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
+         GmemTiledCopyA{},
+         tensor_a,
+         SmemLayoutA{}(_,_,cute::Int<0>{}),
+         TileShape{},
+         ClusterShape{});
+     typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
+         GmemTiledCopyB{},
+         tensor_b,
+         SmemLayoutB{}(_,_,cute::Int<0>{}),
+         TileShape{},
+         ClusterShape{});
+     uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
+     uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
+     uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
+
+     return {
+       tma_load_a,
+       tma_load_b,
+       transaction_bytes,
+       transaction_bytes_mk,
+       transaction_bytes_nk,
+      args.mma_promotion_interval,
+       args.ptr_scale_A,
+       args.ptr_scale_B
+     };
+   }
+
+   template<class ProblemShape>
+   static bool
+   can_implement(
+       ProblemShape const& problem_shape,
+       [[maybe_unused]] Arguments const& args) {
+     constexpr int tma_alignment_bits = 128;
+     auto problem_shape_MNKL = append<4>(problem_shape, 1);
+     auto [M,N,K,L] = problem_shape_MNKL;
+
+     bool implementable = true;
+     constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+     implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
+     constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+     implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
+     /* MMA promotion interval should be a multiple of 4, since each mainloop iteration would issue 4 MMA instructions. */
+    implementable = implementable && (args.mma_promotion_interval % 4 == 0);
+
+     if (!implementable) {
+       CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+     }
+     return implementable;
+   }
+
+   static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+   static constexpr int K_PIPE_MMAS = 1;
+   static constexpr uint32_t TmaTransactionBytesMK =
+         cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
+   static constexpr uint32_t TmaTransactionBytesNK =
+         cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
+   static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
+
+   /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
+   CUTLASS_DEVICE
+   static void prefetch_tma_descriptors(Params const& mainloop_params)
+   {
+     cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
+     cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
+   }
+
+   /// Set up the data needed by this collective for load and mma.
+   /// Returns a tuple of tensors. The collective and the kernel layer have the contract
+   /// Returned tuple must contain at least two elements, with the first two elements being:
+   /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+   /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+   template <class ProblemShape_MNKL>
+   CUTLASS_DEVICE auto
+   load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+     using X = Underscore;
+     // Separate out problem shape for convenience
+     auto [M,N,K,L] = problem_shape_MNKL;
+
+     // TMA requires special handling of strides to deal with coord codomain mapping
+     // Represent the full tensors -- get these from TMA
+     Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
+     Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
+
+     // Make tiled views, defer the slice
+     Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
+     Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
+
+     constexpr auto scales_m = Int<ScaleMsPerTile>{};
+     auto tM = get<2>(gA_mkl.shape());
+     auto tN = get<2>(gB_nkl.shape());
+     auto tK = get<3>(gA_mkl.shape());
+
+     // Make the tiled views of scale tensors
+     auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l)
+     auto scaleA_layout = make_ordered_layout(scaleA_shape,  Step<_0, _1, _2>{});
+     auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l)
+     auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{});
+
+     // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and
+     // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl.
+     Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l)
+     Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l)
+
+     return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl);
+   }
+
+   /// Perform a collective-scoped matrix multiply-accumulate
+   /// Producer Perspective
+   template <
+     class TensorA, class TensorB,
+     class TensorScaleA, class TensorScaleB,
+     class KTileIterator, class BlockCoord
+   >
+   CUTLASS_DEVICE void
+   load(
+       Params const& mainloop_params,
+       MainloopPipeline pipeline,
+       PipelineState smem_pipe_write,
+       cute::tuple<TensorA, TensorB, TensorScaleA, TensorScaleB> const& load_inputs,
+       BlockCoord const& blk_coord,
+       KTileIterator k_tile_iter, int k_tile_count,
+       int thread_idx,
+       uint32_t block_rank_in_cluster,
+       TensorStorage& shared_tensors) {
+     int lane_predicate = cute::elect_one_sync();
+
+     // Blockscaling: Tma loads for load_input and CpAsync for load_scale
+     Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+     Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+     Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k)
+     Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
+
+     //
+     // Prepare the TMA loads for A and B
+     //
+
+     constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+     uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
+
+     Tensor gA_mkl = get<0>(load_inputs);
+     Tensor gB_nkl = get<1>(load_inputs);
+
+     auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+     auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+     // Partition the inputs based on the current block coordinates.
+     auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+     Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+     Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+
+     // Block scaling: load_scale has scaling tensors in global memory which are not tiled
+     Tensor mScaleA_mkl = get<2>(load_inputs);
+     Tensor mScaleB_nkl = get<3>(load_inputs);
+     auto scales_m = get<0>(mScaleA_mkl.shape());
+
+     Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape());
+
+     Tensor gScaleA = local_tile(
+       mScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}),
+       make_coord(m_coord,_,l_coord));                   // (ScaleMsPerTile,k,1)
+     Tensor cScaleA = local_tile(
+       cScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}),
+       make_coord(m_coord,_,l_coord));
+     Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord);                                           // (1,k,1)
+
+     // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128
+     TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{},
+       Layout<Shape<_32, _1>>{}, Layout<Shape<_4, _1>>{}); // (1,1,1)
+     TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{},
+       Layout<Shape<_1>>{}, Layout<Shape<_1>>{}); // (1,1,1)
+     ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x);
+     ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x);
+
+     Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA);
+     Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA);
+     Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA);
+
+     Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB);
+     Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB);
+
+     // Applies the mapping from block_tma_a
+     Tensor tAgA = block_tma_a.partition_S(gA);                                              // (TMA,TMA_M,TMA_K,k)
+     Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+     Tensor tBgB = block_tma_b.partition_S(gB);                                              // (TMA,TMA_N,TMA_K,k)
+     Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+     uint16_t mcast_mask_a = 0;
+     uint16_t mcast_mask_b = 0;
+
+     // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors
+     // Maps the tile -> block, value
+     if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+       auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+       for (int n = 0; n < size<1>(block_layout); ++n) {
+         mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
+       }
+     }
+
+     if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+       auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+       for (int m = 0; m < size<0>(block_layout); ++m) {
+         mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
+       }
+     }
+
+     // Allocate predicate tensors for a_scales (since we can't guarantee that
+     // all scales are valid, since we could have a partial tiles along M)
+     Tensor tApA_ScaleA = make_tensor<bool>(shape(tAsA_ScaleA(_,_,0)));
+     #pragma unroll
+     for (int i = 0; i < size(tApA_ScaleA); ++i) {
+       tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m;
+     }
+
+     // Mainloop
+     CUTLASS_PRAGMA_NO_UNROLL
+     for ( ; k_tile_count > 0; --k_tile_count) {
+       // LOCK smem_pipe_write for _writing_
+       pipeline.producer_acquire(smem_pipe_write);
+
+       //
+       // Copy gmem to smem for *k_tile_iter
+       //
+       int write_stage = smem_pipe_write.index();
+       using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+       BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+       // Copy operands A and B from global memory to shared memory
+       if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+       if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+
+       // Copy scale tensors from global memory to shared memory
+       copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage));
+       copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage));
+       pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc);
+
+       ++k_tile_iter;
+
+       // Advance smem_pipe_write
+       ++smem_pipe_write;
+     }
+   }
+
+   /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+   CUTLASS_DEVICE void
+   load_tail(
+       MainloopPipeline pipeline,
+       PipelineState smem_pipe_write) {
+     int lane_predicate = cute::elect_one_sync();
+
+     // Issue the epilogue waits
+     if (lane_predicate) {
+       /* This helps avoid early exit of blocks in Cluster
+        * Waits for all stages to either be released (all
+        * Consumer UNLOCKs), or if the stage was never used
+        * then would just be acquired since the phase was
+        * still inverted from make_producer_start_state
+        */
+       pipeline.producer_tail(smem_pipe_write);
+     }
+   }
+
+   /// Perform a collective-scoped matrix multiply-accumulate
+   /// Consumer Perspective
+   template <
+     class FrgTensorC
+   >
+   CUTLASS_DEVICE void
+   mma(MainloopPipeline pipeline,
+       PipelineState smem_pipe_read,
+       FrgTensorC& accum,
+       int k_tile_count,
+       int thread_idx,
+       TensorStorage& shared_tensors,
+       Params const& mainloop_params) {
+
+
+     static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+     static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+     static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+     static_assert(cute::is_void_v<SmemCopyAtomA>,
+       "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+     static_assert(cute::is_void_v<SmemCopyAtomB>,
+       "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+     Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+     Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+
+     // Block scaling
+     Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()),
+       Layout<
+         Shape<Shape<Int<ScaleGranularityM>, Int<ScaleMsPerTile>>, cute::tuple_element_t<1, TileShape>, Int<DispatchPolicy::Stages>>,
+         Stride<Stride<_0, _1>, _0, Int<ScaleMsPerTile>>
+       >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k)
+     Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
+
+     //
+     // Define C accumulators and A/B partitioning
+     //
+
+     // Layout of warp group to thread mapping
+
+     static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and
+                   stride<0>(typename TiledMma::BLayout{}) == 0 and
+                   size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                   size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup,
+                   "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+     constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+     Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{},
+                                                   Int<NumThreadsPerWarpGroup>{});
+
+     int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+     TiledMma tiled_mma;
+     auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+     Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC);    // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C.
+
+     Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+     Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+     // Allocate "fragments/descriptors"
+     Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+     Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+     CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+     CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+     CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+     CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+     CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+     CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+     //
+     // PIPELINED MAIN LOOP
+     //
+     static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+         "ERROR : Incorrect number of MMAs in flight");
+
+     // We release buffers to producer warps(dma load) with some mmas in flight
+     PipelineState smem_pipe_release = smem_pipe_read;
+
+     // Per block scale values for operand A and B
+
+     using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout.
+     using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above
+
+     Tensor tCrScaleAViewAsC = make_tensor<ElementBlockScale>(RegLayoutScaleAViewAsC{});              // (MMA,MMA_M,MMA_N)
+     ElementBlockScale scale_b;
+
+     // Prologue GMMAs
+     int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+
+     tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+     GmmaFP8Accumulation accumulation(accum, mainloop_params.mma_promotion_interval, size<2>(tCrA));
+     warpgroup_fence_operand(accumulation());
+     CUTLASS_PRAGMA_UNROLL
+     for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
+     {
+       // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+       auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+       pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+       if (accumulation.prepare_if_needed()) {
+         tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+       }
+
+       int read_stage = smem_pipe_read.index();
+
+       // Load per block scale values from shared memory to registers.
+       scale_b = sScaleB[read_stage];
+       CUTLASS_PRAGMA_UNROLL
+       for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+         tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
+       }
+       if constexpr (ScaleMsPerTile == 1) {
+         static_assert(size(RegLayoutScaleAEssential{}) == 1);
+         tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
+       } else {
+         CUTLASS_PRAGMA_UNROLL
+         for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+           tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
+         }
+       }
+
+       warpgroup_arrive();
+       // Unroll the K mode manually to set scale D to 1
+       CUTLASS_PRAGMA_UNROLL
+       for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+         // (V,M,K) x (V,N,K) => (V,M,N)
+         cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
+         tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+       }
+       warpgroup_commit_batch();
+
+       // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
+       accumulation.scale_if_needed(tCrScaleAViewAsC);
+
+       ++smem_pipe_read;
+     }
+
+     warpgroup_fence_operand(accumulation());
+     // Mainloop GMMAs
+     k_tile_count -= prologue_mma_count;
+
+     CUTLASS_PRAGMA_NO_UNROLL
+     for ( ; k_tile_count > 0; --k_tile_count)
+     {
+       // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+       auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+       pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+       //
+       // Compute on k_tile
+       //
+
+       int read_stage = smem_pipe_read.index();
+
+       // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N)
+       scale_b = sScaleB[read_stage];
+       CUTLASS_PRAGMA_UNROLL
+       for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+         tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
+       }
+       if constexpr (ScaleMsPerTile == 1) {
+         static_assert(size(RegLayoutScaleAEssential{}) == 1);
+         tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
+       } else {
+         CUTLASS_PRAGMA_UNROLL
+         for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+           tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
+         }
+       }
+
+       if (accumulation.prepare_if_needed()) {
+         tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+       }
+
+       warpgroup_fence_operand(accumulation());
+       warpgroup_arrive();
+       // Unroll the K mode manually to set scale D to 1
+       CUTLASS_PRAGMA_UNROLL
+       for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+         // (V,M,K) x (V,N,K) => (V,M,N)
+         cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
+         tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+       }
+       warpgroup_commit_batch();
+
+       /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
+       warpgroup_wait<K_PIPE_MMAS>();
+       warpgroup_fence_operand(accumulation());
+
+       // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
+       accumulation.scale_if_needed(tCrScaleAViewAsC);
+
+       pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+
+       // Advance smem_pipe_read and smem_pipe_release
+       ++smem_pipe_read;
+       ++smem_pipe_release;
+     }
+
+     accumulation.scale_residue_if_needed(tCrScaleAViewAsC);
+
+     warpgroup_fence_operand(accumulation());
+   }
+
+   /// Perform a Consumer Epilogue to release all buffers
+   CUTLASS_DEVICE void
+   mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+     // Prologue GMMAs
+     int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+     k_tile_count -= prologue_mma_count;
+
+     smem_pipe_release.advance(k_tile_count);
+
+     // Wait on all GMMAs to complete
+     warpgroup_wait<0>();
+
+     for (int count = 0; count < prologue_mma_count; ++count) {
+       pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+       ++smem_pipe_release;
+     }
+   }
+ };
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////
+
+ } // namespace cutlass::gemm::collective
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////
--- a/sgl-kernel/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
@@ -0,0 +1,37 @@
+// Adapt from https://github.com/vllm-project/vllm/blob/v0.7.1/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
+#pragma once
+
+#include <cutlass/gemm/dispatch_policy.hpp>
+
+namespace cutlass::gemm {
+
+//////////////////////////////////////////////////////////////////////////////
+
+// FP8 related policies (including Blocked Scaled Accumulation)
+//  `ScaleGranularityM` specifies scaling granularity along M, while zero-value
+//  `ScaleGranularityM` indicates that scaling granularity is
+//  `size<0>(TileShape_MNK{})` along M.
+template <int ScaleGranularityM = 0>
+struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum : KernelTmaWarpSpecializedCooperative {};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp
+// specialized dynamic schedule For FP8 kernels with Block Scaling
+template <
+    int Stages_,
+    class ClusterShape_ = Shape<_1, _1, _1>,
+    class KernelSchedule = KernelTmaWarpSpecialized,
+    int ScaleGranularityM = 0  // `ScaleGranularityM` specifies scaling granularity along M,
+                               // while zero-value `ScaleGranularityM` indicates that scaling
+                               // granularity is `size<0>(TileShape_MNK{})` along M.
+    >
+struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
+    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_, KernelSchedule> {
+  static_assert(
+      cute::
+          is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>,
+      "KernelSchedule must be one of the warp specialized policies");
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm
--- a/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_universal_base_compat.h
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_universal_base_compat.h
@@ -0,0 +1,356 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Adapted from
+// https://github.com/NVIDIA/TensorRT-LLM/blob/be1788106245496872d18e702978e59b6bfd50e0/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/device/gemm_universal_base_compat.h
+#pragma once
+
+#include <cutlass/cutlass.h>
+#include <cutlass/device_kernel.h>
+#include <cutlass/trace.h>
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+    This is the device layer from CUTLASS 2.10 (SHA - cc85b64cf676c45f98a17e3a47c0aafcf817f088)
+    It is replicated here since we needed to duplicate kernel level APIs for mixed dtype GEMMs
+    and SmoothQuant. The newer device layer is not compatible with these older kernel level APIs.
+
+    Note: While CUTLASS 3.x supports stream-k, none of the kernels in the extensions folder support
+          that feature at the moment.
+  */
+
+template <typename GemmKernel_>
+class GemmUniversalBaseCompat {
+ public:
+  using GemmKernel = GemmKernel_;
+  using ThreadblockShape = typename GemmKernel::Mma::Shape;
+
+  using ElementA = typename GemmKernel::ElementA;
+  using LayoutA = typename GemmKernel::LayoutA;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  static ComplexTransform const kTransformA = GemmKernel::kTransformA;
+
+  using ElementB = typename GemmKernel::ElementB;
+  using LayoutB = typename GemmKernel::LayoutB;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  static ComplexTransform const kTransformB = GemmKernel::kTransformB;
+
+  using ElementC = typename GemmKernel::ElementC;
+  using LayoutC = typename GemmKernel::LayoutC;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+
+  using ElementAccumulator = typename GemmKernel::Mma::Policy::Operator::ElementC;
+
+  using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp;
+  using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle;
+  using Operator = typename GemmKernel::Operator;
+
+  /// Argument structure
+  using Arguments = typename GemmKernel::Arguments;
+
+ protected:
+  /// Kernel parameters object
+  typename GemmKernel::Params params_;
+
+ protected:
+  /// Private helper to obtain the grid dimensions with fix-up for split-K
+  static void get_grid_shape_(gemm::GemmCoord& grid_tiled_shape, int& gemm_k_size, Arguments const& args) {
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        args.problem_size, {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, args.batch_count);
+
+    gemm_k_size = args.problem_size.k();
+
+    if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      int const kAlignK =
+          const_max(const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value), 1);
+
+      gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK);
+
+      if (gemm_k_size) {
+        grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size);
+      }
+    }
+  }
+
+ public:
+  /// Constructs the GEMM.
+  GemmUniversalBaseCompat() {}
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const& args) {
+    // Determine grid shape
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int gemm_k_size = 0;
+
+    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);
+
+    ThreadblockSwizzle threadblock_swizzle;
+    dim3 grid = threadblock_swizzle.get_grid_shape(grid_tiled_shape);
+
+    uint32_t const kGridYZMax = ((1 << (sizeof(uint16_t) * 8)) - 1);
+
+    if (!(grid.y <= kGridYZMax && grid.z <= kGridYZMax)) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return GemmKernel::can_implement(args);
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const& args) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::get_workspace_size()");
+
+    size_t workspace_bytes = 0;
+
+    // Determine grid shape
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int gemm_k_size = 0;
+
+    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);
+
+    if (args.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      // Split-K parallel always requires a temporary workspace
+      workspace_bytes = sizeof(ElementC) * size_t(args.batch_stride_D) * size_t(grid_tiled_shape.k());
+    } else if (args.mode == GemmUniversalMode::kGemm && grid_tiled_shape.k() > 1) {
+      // Serial split-K only requires a temporary workspace if the number of partitions along the
+      // GEMM K dimension is greater than one.
+      workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
+    }
+
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
+
+    workspace_bytes += GemmKernel::get_extra_workspace_size(args, grid_tiled_shape);
+
+    return workspace_bytes;
+  }
+
+  /// Computes the grid shape
+  static dim3 get_grid_shape(Arguments const& args) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::get_grid_shape()");
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int gemm_k_size = 0;
+
+    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);
+    dim3 result = threadblock_swizzle.get_grid_shape(grid_tiled_shape);
+
+    CUTLASS_TRACE_HOST(
+        "  grid_tiled_shape: " << grid_tiled_shape << "\n"
+                               << "  result = {" << result << "}");
+
+    return result;
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::maximum_active_blocks()");
+
+    int max_active_blocks = -1;
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    CUTLASS_TRACE_HOST("  smem_size: " << smem_size << " bytes");
+
+    if (smem_size <= (48 << 10)) {
+      cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &max_active_blocks, Kernel<GemmKernel>, GemmKernel::kThreadCount, smem_size);
+
+      if (result == cudaSuccess) {
+        CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
+        return max_active_blocks;
+      }
+    } else {
+      // Query assuming zero shared memory then compute occupancy limit based on SMEM
+      cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &max_active_blocks, Kernel<GemmKernel>, GemmKernel::kThreadCount, 0);
+
+      if (result != cudaSuccess) {
+        CUTLASS_TRACE_HOST(
+            "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error " << cudaGetErrorString(result));
+
+        return -1;
+      }
+
+      if (smem_capacity < 0) {
+        int device_idx = 0;
+        result = cudaGetDevice(&device_idx);
+
+        if (result != cudaSuccess) {
+          return -1;
+        }
+
+        cudaDeviceProp properties;
+        result = cudaGetDeviceProperties(&properties, device_idx);
+
+        if (result != cudaSuccess) {
+          return -1;
+        }
+
+        smem_capacity = static_cast<int>(properties.sharedMemPerMultiprocessor);
+      }
+
+      int occupancy = std::min(max_active_blocks, smem_capacity / smem_size);
+
+      CUTLASS_TRACE_HOST("  occupancy: " << occupancy);
+
+      return occupancy;
+    }
+
+    CUTLASS_TRACE_HOST("  returning internal error");
+
+    return -1;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST(
+        "GemmUniversalBaseCompat::initialize() - workspace " << workspace
+                                                             << ", stream: " << (stream ? "non-null" : "null"));
+
+    size_t workspace_bytes = get_workspace_size(args);
+
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
+
+    if (workspace_bytes) {
+      if (!workspace) {
+        CUTLASS_TRACE_HOST("  error: device workspace must not be null");
+
+        return Status::kErrorWorkspaceNull;
+      }
+
+      if (args.mode == GemmUniversalMode::kGemm) {
+        CUTLASS_TRACE_HOST("  clearing device workspace");
+        cudaError_t result = cudaMemsetAsync(workspace, 0, workspace_bytes, stream);
+
+        if (result != cudaSuccess) {
+          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
+
+          return Status::kErrorInternal;
+        }
+      }
+    }
+
+    // Get CUDA grid shape
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int gemm_k_size = 0;
+
+    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);
+
+    // Initialize the Params structure
+    params_ = typename GemmKernel::Params(args, grid_tiled_shape, gemm_k_size, static_cast<int*>(workspace));
+
+    // Specify shared memory capacity for kernel.
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result =
+          cudaFuncSetAttribute(Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const& args, void* workspace = nullptr) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat()::update() - workspace: " << workspace);
+
+    size_t workspace_bytes = get_workspace_size(args);
+
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_.update(args, workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::run()");
+
+    //
+    // Configure grid and block dimensions
+    //
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(GemmKernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    //
+    // Launch kernel
+    //
+
+    CUTLASS_TRACE_HOST("  grid: (" << grid << "),  block: (" << block << "),  SMEM: " << smem_size << " bytes");
+
+    // Launch
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    //
+    // Query for errors
+    //
+    cudaError_t result = cudaGetLastError();
+
+    if (result != cudaSuccess) {
+      CUTLASS_TRACE_HOST("  grid launch failed with error " << cudaGetErrorString(result));
+      return Status::kErrorInternal;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace device
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_with_epilogue_visitor.h
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_with_epilogue_visitor.h
@@ -0,0 +1,492 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Adapted from
+// https://github.com/NVIDIA/TensorRT-LLM/blob/be1788106245496872d18e702978e59b6bfd50e0/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h
+
+#pragma once
+
+#include <cutlass/complex.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/fast_math.h>
+#include <cutlass/matrix_coord.h>
+#include <cutlass/trace.h>
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    typename Mma_,                ///! Threadblock-scoped matrix multiply-accumulate
+    typename Epilogue_,           ///! Epilogue
+    typename ThreadblockSwizzle_  ///! Threadblock swizzling function
+    >
+struct GemmWithEpilogueVisitor {
+ public:
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueVisitor = typename Epilogue::Visitor;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using TensorRefA = TensorRef<ElementA, LayoutA>;
+
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using TensorRefB = TensorRef<ElementB, LayoutB>;
+
+  using ElementCompute = typename EpilogueVisitor::ElementCompute;
+  using LayoutAlphaCol = cutlass::layout::RowMajor;
+  using LayoutAlphaRow = cutlass::layout::ColumnMajor;
+  using TensorRefAlphaCol = TensorRef<ElementCompute, LayoutAlphaCol>;
+  using TensorRefAlphaRow = TensorRef<ElementCompute, LayoutAlphaRow>;
+
+  using ElementC = typename EpilogueVisitor::ElementOutput;
+  using LayoutC = typename Epilogue::Layout;
+  using TensorRefC = TensorRef<ElementC, LayoutC>;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+  using EpilogueOutputOp =
+      typename Epilogue::Visitor::ElementwiseFunctor;  // Define type so GemmUniversalBase doesn't complain
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode;
+    GemmCoord problem_size;
+    int batch_count;
+
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+    TensorRefAlphaCol ref_alpha_col;
+    TensorRefAlphaRow ref_alpha_row;
+    TensorRefC ref_C;
+    TensorRefC ref_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_D;
+
+    typename EpilogueVisitor::Arguments epilogue_visitor;
+
+    //
+    // Methods
+    //
+
+    Arguments() : mode(GemmUniversalMode::kGemm), batch_count(1) {}
+
+    /// constructs an arguments structure
+    Arguments(
+        GemmCoord problem_size_,
+        TensorRefA ref_A_,
+        TensorRefB ref_B_,
+        TensorRefAlphaCol ref_alpha_col_,
+        TensorRefAlphaRow ref_alpha_row_,
+        TensorRefC ref_C_,
+        TensorRefC ref_D_,
+        typename EpilogueVisitor::Arguments epilogue_visitor_)
+        : mode(GemmUniversalMode::kGemm),
+          problem_size(problem_size_),
+          batch_count(1),
+          ref_A(ref_A_),
+          ref_B(ref_B_),
+          ref_alpha_col(ref_alpha_col_),
+          ref_alpha_row(ref_alpha_row_),
+          ref_C(ref_C_),
+          ref_D(ref_D_),
+          batch_stride_A(0),
+          batch_stride_B(0),
+          batch_stride_D(0),
+          epilogue_visitor(epilogue_visitor_) {}
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorB::Params params_B;
+    typename EpilogueVisitor::ScaleTileIterator::Params params_alpha_col;
+    typename EpilogueVisitor::ScaleTileIterator::Params params_alpha_row;
+    typename EpilogueVisitor::OutputTileIterator::Params params_C;
+    typename EpilogueVisitor::OutputTileIterator::Params params_D;
+
+    GemmUniversalMode mode;
+    int batch_count;
+    int gemm_k_size;
+
+    void* ptr_A;
+    void* ptr_B;
+    typename EpilogueVisitor::ScaleTileIterator::Element* ptr_alpha_col;
+    typename EpilogueVisitor::ScaleTileIterator::Element* ptr_alpha_row;
+    ElementC* ptr_C;
+    ElementC* ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+
+    typename EpilogueVisitor::Params epilogue_visitor;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params()
+        : swizzle_log_tile(0),
+          params_A(0),
+          params_B(0),
+          params_alpha_col(0),
+          params_C(0),
+          params_D(0),
+          batch_count(0),
+          gemm_k_size(0),
+          mode(cutlass::gemm::GemmUniversalMode::kGemm),
+          ptr_A(nullptr),
+          ptr_B(nullptr),
+          ptr_alpha_col(nullptr),
+          ptr_alpha_row(nullptr),
+          ptr_C(nullptr),
+          ptr_D(nullptr),
+          batch_stride_A(0),
+          batch_stride_B(0) {}
+
+    Params(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape_, int gemm_k_size_, int* workspace_)
+        : problem_size(args.problem_size),
+          swizzle_log_tile(0),
+          params_A(args.ref_A.layout()),
+          params_B(args.ref_B.layout()),
+          params_alpha_col(args.ref_alpha_col.layout()),
+          params_alpha_row(args.ref_alpha_col.layout()),
+          params_C(args.ref_C.layout()),
+          params_D(args.ref_D.layout()),
+          mode(args.mode),
+          batch_count(args.batch_count),
+          gemm_k_size(args.problem_size.k()),
+          ptr_A(args.ref_A.data()),
+          ptr_B(args.ref_B.data()),
+          ptr_alpha_col(args.ref_alpha_col.data()),
+          ptr_alpha_row(args.ref_alpha_row.data()),
+          ptr_C(args.ref_C.data()),
+          ptr_D(args.ref_D.data()),
+          batch_stride_A(args.batch_stride_A),
+          batch_stride_B(args.batch_stride_B),
+          epilogue_visitor(args.epilogue_visitor) {
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+          args.problem_size, {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, args.batch_count);
+
+      if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) {
+        int const kAlignK =
+            const_max(const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value), 1);
+
+        gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK);
+
+        if (gemm_k_size) {
+          grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size);
+        }
+      }
+
+      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+    }
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+
+    struct {
+      typename Epilogue::SharedStorage epilogue;
+      typename EpilogueVisitor::SharedStorage visitor;
+    } epilogue;
+  };
+
+ public:
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  GemmWithEpilogueVisitor() {}
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) {
+    CUTLASS_TRACE_HOST("GemmWithEpilogueVisitor::can_implement()");
+
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = EpilogueVisitor::OutputTileIterator::kElementsPerAccess;
+
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = problem_size.m() % kAlignmentA;
+    } else if (
+        platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value ||
+        platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    }
+
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = problem_size.n() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    } else if (
+        platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value ||
+        platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    }
+
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = problem_size.m() % kAlignmentC;
+    } else if (
+        platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value ||
+        platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    }
+
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const& args) {
+    return can_implement(args.problem_size);
+  }
+
+  static size_t get_extra_workspace_size(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) {
+    return 0;
+  }
+
+#define SPLIT_K_ENABLED 1
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void run_kernel_(Params const& params, SharedStorage& shared_storage) {
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+        params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA* ptr_A = static_cast<ElementA*>(params.ptr_A);
+    ElementB* ptr_B = static_cast<ElementB*>(params.ptr_B);
+
+#if SPLIT_K_ENABLED
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm || params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    } else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    } else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA* const*>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB* const*>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+#endif
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+        threadblock_tile_offset.m() * Mma::Shape::kM,
+        offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{offset_k, threadblock_tile_offset.n() * Mma::Shape::kN};
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+        params.params_A, ptr_A, {params.problem_size.m(), problem_size_k}, thread_idx, tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+        params.params_B, ptr_B, {problem_size_k, params.problem_size.n()}, thread_idx, tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+        threadblock_tile_offset.m() * Mma::Shape::kM, threadblock_tile_offset.n() * Mma::Shape::kN);
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    //
+    // Construct the epilogue visitor
+    //
+
+    bool with_bias = true;
+    if (params.ptr_C == nullptr) {
+      with_bias = false;
+    }
+
+    EpilogueVisitor epilogue_visitor(
+        params.epilogue_visitor,
+        shared_storage.epilogue.visitor,
+        params.problem_size.mn(),
+        thread_idx,
+        warp_idx,
+        lane_idx,
+        params.params_alpha_col,
+        params.params_C,
+        params.params_D,
+        with_bias,
+        true,
+        true,
+        params.ptr_alpha_row,
+        params.ptr_alpha_col,
+        params.ptr_C,
+        params.ptr_D,
+        threadblock_offset,
+        blockIdx.y * params.problem_size.m());
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+      // Indicate which position in a serial reduction the output operator is currently updating
+      epilogue_visitor.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+    } else if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) {
+      epilogue_visitor.set_batch_index(threadblock_tile_offset.k());
+    }
+
+    // Construct the epilogue
+    Epilogue epilogue(shared_storage.epilogue.epilogue, thread_idx, warp_idx, lane_idx);
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(epilogue_visitor, accumulators);
+  }
+
+  template <typename CompilationArch>
+  CUTLASS_DEVICE void run_kernel(Params const& params, SharedStorage& shared_storage) {
+    if constexpr (platform::is_same<ArchTag, CompilationArch>::value) {
+      run_kernel_(params, shared_storage);
+    } else {
+      CUTLASS_NOT_IMPLEMENTED();
+    }
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const& params, SharedStorage& shared_storage) {
+    run_kernel<ArchTag>(params, shared_storage);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////