support blockwise fp8 matmul kernel (#3267)

2025-02-13 01:49:33 +08:00
parent 8616357a97
commit 640363ad20
11 changed files with 1366 additions and 0 deletions
--- a/sgl-kernel/benchmark/bench_fp8_blockwise_gemm.py
+++ b/sgl-kernel/benchmark/bench_fp8_blockwise_gemm.py
@@ -0,0 +1,148 @@
+import argparse
+import copy
+import itertools
+
+import torch
+import triton
+from sgl_kernel import fp8_blockwise_scaled_mm
+from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
+
+
+def get_weight_shapes(args):
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. Modify them, if you tune for another different model.
+    # cannot TP
+    total = [
+        # (512 + 64, 7168), # this weight is not supported by current kernel
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (7168, 16384),
+        (7168, 18432),
+    ]
+    # N can TP
+    n_tp = [
+        (18432 * 2, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (24576, 1536),
+        (4096, 7168),
+    ]
+    # K can TP
+    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
+    # only support Deepseek-V3
+    SUPPORT_MODEL = ["deepseek-ai/DeepSeek-V3"]
+
+    weight_shapes = []
+    for model, tp_size in models_tps:
+        assert model in SUPPORT_MODEL
+        for t in total:
+            new_t = [t[0], t[1], model]
+            weight_shapes.append(new_t)
+        for n_t in n_tp:
+            new_t = [n_t[0] // tp_size, n_t[1], model]
+            weight_shapes.append(new_t)
+        for k_t in k_tp:
+            new_t = [k_t[0], k_t[1] // tp_size, model]
+            weight_shapes.append(new_t)
+    return weight_shapes
+
+
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def scale_shape(shape, group_shape):
+    assert len(shape) == len(group_shape)
+    return tuple(cdiv(shape[i], group_shape[i]) for i in range(len(group_shape)))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048],
+        x_log=False,
+        line_arg="provider",
+        line_vals=["vllm", "sgl-kernel"],
+        line_names=["vllm fp8 blockwise gemm", "sgl-kernel fp8 blockwise gemm"],
+        styles=[("blue", "-"), ("orange", "-")],
+        ylabel="GB/s",
+        plot_name="fp8 blockwise scaled matmul",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    M = batch_size
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    a_fp32 = (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
+    a_fp8 = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    b_fp32 = (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
+    b_fp8 = b_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn).t()
+
+    scale_a_group_shape = (1, 128)
+    scale_b_group_shape = (128, 128)
+    scale_a_shape = scale_shape(a_fp8.shape, scale_a_group_shape)
+    scale_b_shape = scale_shape(b_fp8.shape, scale_b_group_shape)
+
+    scale_a = torch.randn(scale_a_shape, device="cuda", dtype=torch.float32)
+    scale_b = torch.randn(scale_b_shape, device="cuda", dtype=torch.float32)
+    scale_a = scale_a.t().contiguous().t()
+    scale_b = scale_b.t().contiguous().t()
+
+    quantiles = [0.5, 0.2, 0.8]
+    if provider == "sgl-kernel":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: fp8_blockwise_scaled_mm(
+                a_fp8, b_fp8, scale_a, scale_b, torch.float16
+            ),
+            quantiles=quantiles,
+        )
+    if provider == "vllm":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: vllm_scaled_mm(a_fp8, b_fp8, scale_a, scale_b, torch.float16),
+            quantiles=quantiles,
+        )
+    gbps = (
+        lambda ms: (
+            (2 * M * N * K - M * N) * a_fp8.element_size()
+            + (3 * M * N) * scale_a.element_size()
+        )
+        * 1e-9
+        / (ms * 1e-3)
+    )
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["deepseek-ai/DeepSeek-V3"],
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    NK_model_names = get_weight_shapes(args)
+    for N, K, model_name in NK_model_names:
+        print(f"{model_name} N={N} K={K}: ")
+        benchmark.run(
+            print_data=True,
+            show_plots=True,
+            save_path="bench_fp8_blockwise_res",
+            N=N,
+            K=K,
+        )
+
+    print("Benchmark finished!")
--- a/sgl-kernel/setup.py
+++ b/sgl-kernel/setup.py
@@ -96,6 +96,7 @@ sources = [
    "src/sgl-kernel/csrc/moe_align_kernel.cu",
    "src/sgl-kernel/csrc/int8_gemm_kernel.cu",
    "src/sgl-kernel/csrc/fp8_gemm_kernel.cu",
+    "src/sgl-kernel/csrc/fp8_blockwise_gemm_kernel.cu",
    "src/sgl-kernel/csrc/lightning_attention_decode_kernel.cu",
    "src/sgl-kernel/csrc/fused_add_rms_norm_kernel.cu",
    "src/sgl-kernel/csrc/eagle_utils.cu",
--- a/sgl-kernel/src/sgl-kernel/init.py
+++ b/sgl-kernel/src/sgl-kernel/init.py
@@ -14,6 +14,7 @@ from sgl_kernel.ops import (
    build_tree_kernel_efficient,
    custom_dispose,
    custom_reduce,
+    fp8_blockwise_scaled_mm,
    fp8_scaled_mm,
    fused_add_rmsnorm,
    gelu_and_mul,
@@ -44,6 +45,7 @@ __all__ = [
    "bmm_fp8",
    "custom_dispose",
    "custom_reduce",
+    "fp8_blockwise_scaled_mm",
    "fp8_scaled_mm",
    "fused_add_rmsnorm",
    "gelu_and_mul",
--- a/sgl-kernel/src/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
+++ b/sgl-kernel/src/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
@@ -0,0 +1,125 @@
+// Adapt from
+// https://github.com/vllm-project/vllm/blob/v0.7.1/csrc/cutlass_extensions/gemm/collective/collective_buildler.hpp
+// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl
+// clang-format off
+#pragma once
+
+#include <cutlass/gemm/collective/builders/sm90_gmma_builder.inl>
+#include "cutlass_extensions/gemm/dispatch_policy.hpp"
+#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_SS (BlockScaled Builders)
+template <
+  class ElementA,
+  class GmemLayoutATag,
+  int AlignmentA,
+  class ElementB,
+  class GmemLayoutBTag,
+  int AlignmentB,
+  class ElementAccumulator,
+  class TileShape_MNK,
+  class ClusterShape_MNK,
+  class StageCountType,
+  int ScaleGranularityM
+>
+struct CollectiveBuilder<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA,
+    GmemLayoutATag,
+    AlignmentA,
+    ElementB,
+    GmemLayoutBTag,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>,
+    cute::enable_if_t<
+      not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
+> {
+  using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>;
+
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+                "Should meet TMA alignment requirement\n");
+
+  static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
+                                                                   KernelPtrArrayTmaWarpSpecializedCooperative,
+                                                                   KernelPtrArrayTmaWarpSpecializedPingpong>);
+  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
+  static_assert((!IsFP8Input || !IsArrayOfPointersGemm),
+                "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now.");
+
+  // For fp32 types, map to tf32 MMA value type
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
+
+  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
+                                                          KernelTmaWarpSpecializedCooperative,
+                                                          KernelPtrArrayTmaWarpSpecializedCooperative,
+                                                          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>;
+  using AtomLayoutMNK = cute::conditional_t<IsCooperative,
+      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
+      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
+
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
+      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
+      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
+
+  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
+  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
+
+  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
+      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType, ScaleGranularityM>;
+
+  using SmemCopyAtomA = void;
+  using SmemCopyAtomB = void;
+
+  using CollectiveOp = CollectiveMma<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementA,
+      TagToStrideA_t<GmemLayoutATag>,
+      ElementB,
+      TagToStrideB_t<GmemLayoutBTag>,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity
+    >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/sgl-kernel/src/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ b/sgl-kernel/src/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@@ -0,0 +1,733 @@
+// clang-format off
+// Adapt from https://github.com/vllm-project/vllm/blob/v0.7.1/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+ #pragma once
+
+ #include "cutlass/cutlass.h"
+ #include "cutlass/gemm/dispatch_policy.hpp"
+ #include "cutlass/trace.h"
+ #include "cutlass/numeric_types.h"
+
+ #include "cute/arch/cluster_sm90.hpp"
+ #include "cute/arch/copy_sm80.hpp"
+ #include "cute/arch/copy_sm90.hpp"
+ #include "cute/algorithm/functional.hpp"
+ #include "cute/atom/mma_atom.hpp"
+ #include "cute/algorithm/gemm.hpp"
+ #include "cute/tensor_predicate.hpp"
+ #include "cute/numeric/arithmetic_tuple.hpp"
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////
+
+ namespace cutlass::gemm::collective {
+ using namespace cute;
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////
+
+ // WarpSpecialized Mainloop
+ template <
+   int Stages,
+   class ClusterShape,
+   class KernelSchedule,
+   int ScaleGranularityM_,
+   class TileShape_,
+   class ElementA_,
+   class StrideA_,
+   class ElementB_,
+   class StrideB_,
+   class TiledMma_,
+   class GmemTiledCopyA_,
+   class SmemLayoutAtomA_,
+   class SmemCopyAtomA_,
+   class TransformA_,
+   class GmemTiledCopyB_,
+   class SmemLayoutAtomB_,
+   class SmemCopyAtomB_,
+   class TransformB_>
+ struct CollectiveMma<
+     MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>,
+     TileShape_,
+     ElementA_,
+     StrideA_,
+     ElementB_,
+     StrideB_,
+     TiledMma_,
+     GmemTiledCopyA_,
+     SmemLayoutAtomA_,
+     SmemCopyAtomA_,
+     TransformA_,
+     GmemTiledCopyB_,
+     SmemLayoutAtomB_,
+     SmemCopyAtomB_,
+     TransformB_>
+ {
+   //
+   // Type Aliases
+   //
+   using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>;
+   using TileShape = TileShape_;
+   using ElementA = ElementA_;
+   using StrideA = StrideA_;
+   using ElementB = ElementB_;
+   using StrideB = StrideB_;
+   using TiledMma = TiledMma_;
+   using ElementAccumulator = typename TiledMma::ValTypeC;
+   using ElementBlockScale = ElementAccumulator;
+   using GmemTiledCopyA = GmemTiledCopyA_;
+   using GmemTiledCopyB = GmemTiledCopyB_;
+   using SmemLayoutAtomA = SmemLayoutAtomA_;
+   using SmemLayoutAtomB = SmemLayoutAtomB_;
+   using SmemCopyAtomA = SmemCopyAtomA_;
+   using SmemCopyAtomB = SmemCopyAtomB_;
+   using TransformA = TransformA_;
+   using TransformB = TransformB_;
+   using ArchTag = typename DispatchPolicy::ArchTag;
+
+   using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+   using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+   using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+   using PipelineParams = typename MainloopPipeline::Params;
+
+   // Two threads per CTA are producers (1 for operand tile and 32 for scales)
+   static constexpr int NumProducerThreadEvents = 33;
+
+   static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_;
+   static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
+
+   static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+   static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+   static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+   static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+   static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+   static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+   static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M.");
+
+   // Tile along modes in a way that maximizes the TMA box size.
+   using SmemLayoutA = decltype(tile_to_shape(
+       SmemLayoutAtomA{},
+       make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+       cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+   using SmemLayoutB = decltype(tile_to_shape(
+       SmemLayoutAtomB{},
+       make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+       cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+
+   // Block scaling gmem-to-smem copy atom
+   using SmemBlockScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
+   using SmemBlockScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
+
+   // Block scaling smem layout
+   using SmemLayoutScaleA = Layout<Shape<Int<ScaleMsPerTile>, Int<DispatchPolicy::Stages>>>;
+   using SmemLayoutScaleB = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>; // `ScaleNsPerTile` is always 1.
+
+   static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
+   static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                 cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                 "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+   static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+       "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+   static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+       "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+   static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
+              "ElementAccumulator and ElementBlockScale should be same datatype");
+
+   struct SharedStorage
+   {
+     struct TensorStorage : cute::aligned_struct<128> {
+       cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // mxk
+       cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // nxk
+       cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleA>> smem_scale_A; // ScaleMsPerTile x k
+       cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleB>> smem_scale_B; // 1xk
+     } tensors;
+
+     using PipelineStorage = typename MainloopPipeline::SharedStorage;
+     PipelineStorage pipeline;
+   };
+   using TensorStorage = typename SharedStorage::TensorStorage;
+   using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+   // Host side kernel arguments
+   struct Arguments {
+     ElementA const* ptr_A;
+     StrideA dA;
+     ElementB const* ptr_B;
+     StrideB dB;
+     uint32_t mma_promotion_interval = 4;
+     ElementBlockScale const* ptr_scale_A;
+     ElementBlockScale const* ptr_scale_B;
+   };
+
+   // Device side kernel params
+   struct Params {
+     // Assumption: StrideA is congruent with Problem_MK
+     using TMA_A = decltype(make_tma_copy_A_sm90(
+         GmemTiledCopyA{},
+         make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
+         SmemLayoutA{}(_,_,0),
+         TileShape{},
+         ClusterShape{}));
+     // Assumption: StrideB is congruent with Problem_NK
+     using TMA_B = decltype(make_tma_copy_B_sm90(
+         GmemTiledCopyB{},
+         make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
+         SmemLayoutB{}(_,_,0),
+         TileShape{},
+         ClusterShape{}));
+     TMA_A tma_load_a;
+     TMA_B tma_load_b;
+     uint32_t tma_transaction_bytes = TmaTransactionBytes;
+     uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
+     uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
+     uint32_t mma_promotion_interval = 4;
+     // Block scaling factors for A and B
+     ElementBlockScale const* ptr_scale_A;
+     ElementBlockScale const* ptr_scale_B;
+   };
+
+   //
+   // Methods
+   //
+
+   template <class ProblemShape>
+   static constexpr Params
+   to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+     (void) workspace;
+
+     // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
+     auto problem_shape_MNKL = append<4>(problem_shape, 1);
+     auto [M,N,K,L] = problem_shape_MNKL;
+
+     auto ptr_A = reinterpret_cast<ElementA const*>(args.ptr_A);
+     auto ptr_B = reinterpret_cast<ElementB const*>(args.ptr_B);
+
+     Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA));
+     Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
+     typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
+         GmemTiledCopyA{},
+         tensor_a,
+         SmemLayoutA{}(_,_,cute::Int<0>{}),
+         TileShape{},
+         ClusterShape{});
+     typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
+         GmemTiledCopyB{},
+         tensor_b,
+         SmemLayoutB{}(_,_,cute::Int<0>{}),
+         TileShape{},
+         ClusterShape{});
+     uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
+     uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
+     uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
+
+     return {
+       tma_load_a,
+       tma_load_b,
+       transaction_bytes,
+       transaction_bytes_mk,
+       transaction_bytes_nk,
+      args.mma_promotion_interval,
+       args.ptr_scale_A,
+       args.ptr_scale_B
+     };
+   }
+
+   template<class ProblemShape>
+   static bool
+   can_implement(
+       ProblemShape const& problem_shape,
+       [[maybe_unused]] Arguments const& args) {
+     constexpr int tma_alignment_bits = 128;
+     auto problem_shape_MNKL = append<4>(problem_shape, 1);
+     auto [M,N,K,L] = problem_shape_MNKL;
+
+     bool implementable = true;
+     constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+     implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
+     constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+     implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
+     /* MMA promotion interval should be a multiple of 4, since each mainloop iteration would issue 4 MMA instructions. */
+    implementable = implementable && (args.mma_promotion_interval % 4 == 0);
+
+     if (!implementable) {
+       CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+     }
+     return implementable;
+   }
+
+   static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+   static constexpr int K_PIPE_MMAS = 1;
+   static constexpr uint32_t TmaTransactionBytesMK =
+         cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
+   static constexpr uint32_t TmaTransactionBytesNK =
+         cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
+   static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
+
+   /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
+   CUTLASS_DEVICE
+   static void prefetch_tma_descriptors(Params const& mainloop_params)
+   {
+     cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
+     cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
+   }
+
+   /// Set up the data needed by this collective for load and mma.
+   /// Returns a tuple of tensors. The collective and the kernel layer have the contract
+   /// Returned tuple must contain at least two elements, with the first two elements being:
+   /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+   /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+   template <class ProblemShape_MNKL>
+   CUTLASS_DEVICE auto
+   load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+     using X = Underscore;
+     // Separate out problem shape for convenience
+     auto [M,N,K,L] = problem_shape_MNKL;
+
+     // TMA requires special handling of strides to deal with coord codomain mapping
+     // Represent the full tensors -- get these from TMA
+     Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
+     Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
+
+     // Make tiled views, defer the slice
+     Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
+     Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
+
+     constexpr auto scales_m = Int<ScaleMsPerTile>{};
+     auto tM = get<2>(gA_mkl.shape());
+     auto tN = get<2>(gB_nkl.shape());
+     auto tK = get<3>(gA_mkl.shape());
+
+     // Make the tiled views of scale tensors
+     auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l)
+     auto scaleA_layout = make_ordered_layout(scaleA_shape,  Step<_0, _1, _2>{});
+     auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l)
+     auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{});
+
+     // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and
+     // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl.
+     Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l)
+     Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l)
+
+     return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl);
+   }
+
+   /// Perform a collective-scoped matrix multiply-accumulate
+   /// Producer Perspective
+   template <
+     class TensorA, class TensorB,
+     class TensorScaleA, class TensorScaleB,
+     class KTileIterator, class BlockCoord
+   >
+   CUTLASS_DEVICE void
+   load(
+       Params const& mainloop_params,
+       MainloopPipeline pipeline,
+       PipelineState smem_pipe_write,
+       cute::tuple<TensorA, TensorB, TensorScaleA, TensorScaleB> const& load_inputs,
+       BlockCoord const& blk_coord,
+       KTileIterator k_tile_iter, int k_tile_count,
+       int thread_idx,
+       uint32_t block_rank_in_cluster,
+       TensorStorage& shared_tensors) {
+     int lane_predicate = cute::elect_one_sync();
+
+     // Blockscaling: Tma loads for load_input and CpAsync for load_scale
+     Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+     Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+     Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k)
+     Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
+
+     //
+     // Prepare the TMA loads for A and B
+     //
+
+     constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+     uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
+
+     Tensor gA_mkl = get<0>(load_inputs);
+     Tensor gB_nkl = get<1>(load_inputs);
+
+     auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+     auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+     // Partition the inputs based on the current block coordinates.
+     auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+     Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+     Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+
+     // Block scaling: load_scale has scaling tensors in global memory which are not tiled
+     Tensor mScaleA_mkl = get<2>(load_inputs);
+     Tensor mScaleB_nkl = get<3>(load_inputs);
+     auto scales_m = get<0>(mScaleA_mkl.shape());
+
+     Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape());
+
+     Tensor gScaleA = local_tile(
+       mScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}),
+       make_coord(m_coord,_,l_coord));                   // (ScaleMsPerTile,k,1)
+     Tensor cScaleA = local_tile(
+       cScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}),
+       make_coord(m_coord,_,l_coord));
+     Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord);                                           // (1,k,1)
+
+     // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128
+     TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{},
+       Layout<Shape<_32, _1>>{}, Layout<Shape<_4, _1>>{}); // (1,1,1)
+     TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{},
+       Layout<Shape<_1>>{}, Layout<Shape<_1>>{}); // (1,1,1)
+     ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x);
+     ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x);
+
+     Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA);
+     Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA);
+     Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA);
+
+     Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB);
+     Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB);
+
+     // Applies the mapping from block_tma_a
+     Tensor tAgA = block_tma_a.partition_S(gA);                                              // (TMA,TMA_M,TMA_K,k)
+     Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+     Tensor tBgB = block_tma_b.partition_S(gB);                                              // (TMA,TMA_N,TMA_K,k)
+     Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+     uint16_t mcast_mask_a = 0;
+     uint16_t mcast_mask_b = 0;
+
+     // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors
+     // Maps the tile -> block, value
+     if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+       auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+       for (int n = 0; n < size<1>(block_layout); ++n) {
+         mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
+       }
+     }
+
+     if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+       auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
+       for (int m = 0; m < size<0>(block_layout); ++m) {
+         mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
+       }
+     }
+
+     // Allocate predicate tensors for a_scales (since we can't guarantee that
+     // all scales are valid, since we could have a partial tiles along M)
+     Tensor tApA_ScaleA = make_tensor<bool>(shape(tAsA_ScaleA(_,_,0)));
+     #pragma unroll
+     for (int i = 0; i < size(tApA_ScaleA); ++i) {
+       tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m;
+     }
+
+     // Mainloop
+     CUTLASS_PRAGMA_NO_UNROLL
+     for ( ; k_tile_count > 0; --k_tile_count) {
+       // LOCK smem_pipe_write for _writing_
+       pipeline.producer_acquire(smem_pipe_write);
+
+       //
+       // Copy gmem to smem for *k_tile_iter
+       //
+       int write_stage = smem_pipe_write.index();
+       using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+       BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+       // Copy operands A and B from global memory to shared memory
+       if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+       if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+
+       // Copy scale tensors from global memory to shared memory
+       copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage));
+       copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage));
+       pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc);
+
+       ++k_tile_iter;
+
+       // Advance smem_pipe_write
+       ++smem_pipe_write;
+     }
+   }
+
+   /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+   CUTLASS_DEVICE void
+   load_tail(
+       MainloopPipeline pipeline,
+       PipelineState smem_pipe_write) {
+     int lane_predicate = cute::elect_one_sync();
+
+     // Issue the epilogue waits
+     if (lane_predicate) {
+       /* This helps avoid early exit of blocks in Cluster
+        * Waits for all stages to either be released (all
+        * Consumer UNLOCKs), or if the stage was never used
+        * then would just be acquired since the phase was
+        * still inverted from make_producer_start_state
+        */
+       pipeline.producer_tail(smem_pipe_write);
+     }
+   }
+
+   /// Perform a collective-scoped matrix multiply-accumulate
+   /// Consumer Perspective
+   template <
+     class FrgTensorC
+   >
+   CUTLASS_DEVICE void
+   mma(MainloopPipeline pipeline,
+       PipelineState smem_pipe_read,
+       FrgTensorC& accum,
+       int k_tile_count,
+       int thread_idx,
+       TensorStorage& shared_tensors,
+       Params const& mainloop_params) {
+
+
+     static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+     static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+     static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+     static_assert(cute::is_void_v<SmemCopyAtomA>,
+       "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+     static_assert(cute::is_void_v<SmemCopyAtomB>,
+       "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+     Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+     Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+
+     // Block scaling
+     Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()),
+       Layout<
+         Shape<Shape<Int<ScaleGranularityM>, Int<ScaleMsPerTile>>, cute::tuple_element_t<1, TileShape>, Int<DispatchPolicy::Stages>>,
+         Stride<Stride<_0, _1>, _0, Int<ScaleMsPerTile>>
+       >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k)
+     Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
+
+     //
+     // Define C accumulators and A/B partitioning
+     //
+
+     // Layout of warp group to thread mapping
+
+     static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and
+                   stride<0>(typename TiledMma::BLayout{}) == 0 and
+                   size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                   size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup,
+                   "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+     constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+     Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{},
+                                                   Int<NumThreadsPerWarpGroup>{});
+
+     int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+     TiledMma tiled_mma;
+     auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+     Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC);    // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C.
+
+     Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+     Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+     // Allocate "fragments/descriptors"
+     Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+     Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+     CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+     CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+     CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+     CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+     CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+     CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+     //
+     // PIPELINED MAIN LOOP
+     //
+     static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+         "ERROR : Incorrect number of MMAs in flight");
+
+     // We release buffers to producer warps(dma load) with some mmas in flight
+     PipelineState smem_pipe_release = smem_pipe_read;
+
+     // Per block scale values for operand A and B
+
+     using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout.
+     using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above
+
+     Tensor tCrScaleAViewAsC = make_tensor<ElementBlockScale>(RegLayoutScaleAViewAsC{});              // (MMA,MMA_M,MMA_N)
+     ElementBlockScale scale_b;
+
+     // Prologue GMMAs
+     int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+
+     tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+     GmmaFP8Accumulation accumulation(accum, mainloop_params.mma_promotion_interval, size<2>(tCrA));
+     warpgroup_fence_operand(accumulation());
+     CUTLASS_PRAGMA_UNROLL
+     for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
+     {
+       // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+       auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+       pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+       if (accumulation.prepare_if_needed()) {
+         tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+       }
+
+       int read_stage = smem_pipe_read.index();
+
+       // Load per block scale values from shared memory to registers.
+       scale_b = sScaleB[read_stage];
+       CUTLASS_PRAGMA_UNROLL
+       for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+         tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
+       }
+       if constexpr (ScaleMsPerTile == 1) {
+         static_assert(size(RegLayoutScaleAEssential{}) == 1);
+         tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
+       } else {
+         CUTLASS_PRAGMA_UNROLL
+         for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+           tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
+         }
+       }
+
+       warpgroup_arrive();
+       // Unroll the K mode manually to set scale D to 1
+       CUTLASS_PRAGMA_UNROLL
+       for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+         // (V,M,K) x (V,N,K) => (V,M,N)
+         cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
+         tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+       }
+       warpgroup_commit_batch();
+
+       // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
+       accumulation.scale_if_needed(tCrScaleAViewAsC);
+
+       ++smem_pipe_read;
+     }
+
+     warpgroup_fence_operand(accumulation());
+     // Mainloop GMMAs
+     k_tile_count -= prologue_mma_count;
+
+     CUTLASS_PRAGMA_NO_UNROLL
+     for ( ; k_tile_count > 0; --k_tile_count)
+     {
+       // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+       auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+       pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+       //
+       // Compute on k_tile
+       //
+
+       int read_stage = smem_pipe_read.index();
+
+       // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N)
+       scale_b = sScaleB[read_stage];
+       CUTLASS_PRAGMA_UNROLL
+       for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+         tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
+       }
+       if constexpr (ScaleMsPerTile == 1) {
+         static_assert(size(RegLayoutScaleAEssential{}) == 1);
+         tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
+       } else {
+         CUTLASS_PRAGMA_UNROLL
+         for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
+           tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
+         }
+       }
+
+       if (accumulation.prepare_if_needed()) {
+         tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+       }
+
+       warpgroup_fence_operand(accumulation());
+       warpgroup_arrive();
+       // Unroll the K mode manually to set scale D to 1
+       CUTLASS_PRAGMA_UNROLL
+       for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+         // (V,M,K) x (V,N,K) => (V,M,N)
+         cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
+         tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+       }
+       warpgroup_commit_batch();
+
+       /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
+       warpgroup_wait<K_PIPE_MMAS>();
+       warpgroup_fence_operand(accumulation());
+
+       // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
+       accumulation.scale_if_needed(tCrScaleAViewAsC);
+
+       pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+
+       // Advance smem_pipe_read and smem_pipe_release
+       ++smem_pipe_read;
+       ++smem_pipe_release;
+     }
+
+     accumulation.scale_residue_if_needed(tCrScaleAViewAsC);
+
+     warpgroup_fence_operand(accumulation());
+   }
+
+   /// Perform a Consumer Epilogue to release all buffers
+   CUTLASS_DEVICE void
+   mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+     // Prologue GMMAs
+     int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+     k_tile_count -= prologue_mma_count;
+
+     smem_pipe_release.advance(k_tile_count);
+
+     // Wait on all GMMAs to complete
+     warpgroup_wait<0>();
+
+     for (int count = 0; count < prologue_mma_count; ++count) {
+       pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+       ++smem_pipe_release;
+     }
+   }
+ };
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////
+
+ } // namespace cutlass::gemm::collective
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////
--- a/sgl-kernel/src/sgl-kernel/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
+++ b/sgl-kernel/src/sgl-kernel/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
@@ -0,0 +1,33 @@
+// Adapt from https://github.com/vllm-project/vllm/blob/v0.7.1/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
+#pragma once
+
+#include <cutlass/gemm/dispatch_policy.hpp>
+
+namespace cutlass::gemm {
+
+//////////////////////////////////////////////////////////////////////////////
+
+// FP8 related policies (including Blocked Scaled Accumulation)
+//  `ScaleGranularityM` specifies scaling granularity along M, while zero-value
+//  `ScaleGranularityM` indicates that scaling granularity is
+//  `size<0>(TileShape_MNK{})` along M.
+template <int ScaleGranularityM = 0>
+struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum : KernelTmaWarpSpecializedCooperative {};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp
+// specialized dynamic schedule For FP8 kernels with Block Scaling
+template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>, class KernelSchedule = KernelTmaWarpSpecialized,
+          int ScaleGranularityM = 0  // `ScaleGranularityM` specifies scaling granularity along M,
+                                     // while zero-value `ScaleGranularityM` indicates that scaling
+                                     // granularity is `size<0>(TileShape_MNK{})` along M.
+          >
+struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
+    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_, KernelSchedule> {
+  static_assert(cute::is_same_v<KernelSchedule,
+                                KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>,
+                "KernelSchedule must be one of the warp specialized policies");
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm
--- a/sgl-kernel/src/sgl-kernel/csrc/fp8_blockwise_gemm_kernel.cu
+++ b/sgl-kernel/src/sgl-kernel/csrc/fp8_blockwise_gemm_kernel.cu
@@ -0,0 +1,191 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <cudaTypedefs.h>
+#include <cutlass/arch/arch.h>
+#include <cutlass/arch/memory.h>
+#include <cutlass/arch/mma.h>
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/epilogue/thread/activation.h>
+#include <cutlass/epilogue/thread/linear_combination.h>
+#include <cutlass/epilogue/threadblock/default_thread_map_tensor_op.h>
+#include <cutlass/gemm/device/gemm.h>
+#include <cutlass/gemm/device/gemm_universal_adapter.h>
+#include <cutlass/gemm/gemm.h>
+#include <cutlass/gemm/kernel/default_gemm_universal_with_visitor.h>
+#include <cutlass/gemm/thread/mma.h>
+#include <cutlass/layout/matrix.h>
+#include <cutlass/matrix_coord.h>
+#include <cutlass/numeric_types.h>
+#include <cutlass/tensor_ref.h>
+#include <torch/all.h>
+
+#include <cute/tensor.hpp>
+#include <cutlass/epilogue/collective/collective_builder.hpp>
+#include <cutlass/epilogue/collective/default_epilogue.hpp>
+#include <cutlass/epilogue/threadblock/fusion/visitors.hpp>
+#include <cutlass/gemm/collective/collective_builder.hpp>
+#include <cutlass/gemm/dispatch_policy.hpp>
+#include <cutlass/gemm/kernel/gemm_universal.hpp>
+#include <cutlass/util/packed_stride.hpp>
+
+#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
+#include "cutlass_extensions/gemm/dispatch_policy.hpp"
+#include "utils.h"
+
+using namespace cute;
+
+template <typename OutType, typename TileShape, typename ClusterShape, int ScaleGranularityM = 1>
+void launch_sm90_fp8_blockwise_scaled_mm(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b,
+                                         const torch::Tensor& scales_a, const torch::Tensor& scales_b) {
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+  using ElementBlockScale = float;
+
+  using ElementA = cutlass::float_e4m3_t;
+  using LayoutA = cutlass::layout::RowMajor;
+  constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+
+  using ElementB = cutlass::float_e4m3_t;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+
+  using ElementC = void;
+  using LayoutC = cutlass::layout::RowMajor;
+  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<OutType>::value;
+
+  using ElementD = OutType;
+  using LayoutD = cutlass::layout::RowMajor;
+  constexpr int AlignmentD = AlignmentC;
+
+  using ArchTag = cutlass::arch::Sm90;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
+  using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT<cutlass::epilogue::fusion::Sm90AccFetch>;
+
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>;
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, ElementAccumulator, ElementCompute, ElementC,
+      LayoutC, AlignmentC, ElementD, LayoutD, AlignmentD, EpilogueSchedule, StoreEpilogueCompute>::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB, LayoutB, AlignmentB, ElementAccumulator,
+      TileShape, ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      KernelSchedule>::CollectiveOp;
+
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmUniversal<Shape<int, int, int, int>,  // Indicates ProblemShape
+                                           CollectiveMainloop, CollectiveEpilogue, cutlass::gemm::PersistentScheduler>;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  Gemm gemm_op;
+
+  int m = a.size(0);
+  int k = a.size(1);
+  int n = b.size(1);
+
+  auto a_ptr = static_cast<ElementA*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementB*>(b.data_ptr());
+  auto o_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  auto a_s_ptr = static_cast<ElementBlockScale*>(scales_a.data_ptr());
+  auto b_s_ptr = static_cast<ElementBlockScale*>(scales_b.data_ptr());
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC stride_c;
+  StrideD stride_d = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(m, n, 1));
+
+  typename GemmKernel::MainloopArguments mainloop_args{a_ptr, stride_a, b_ptr, stride_b, 4, a_s_ptr, b_s_ptr};
+  typename GemmKernel::EpilogueArguments epilogue_args{{}, nullptr, stride_d, o_ptr, stride_d};
+
+  typename Gemm::Arguments args = {
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {m, n, k, 1},
+      mainloop_args,
+      epilogue_args,
+  };
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  auto can_implement = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement == cutlass::Status::kSuccess, cutlassGetStatusString(can_implement))
+
+  auto status = gemm_op.run(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, cutlassGetStatusString(status))
+}
+
+template <typename OutType>
+void sm90_fp8_blockwise_dispatch_shape(torch::Tensor& out, const torch::Tensor& a, const torch::Tensor& b,
+                                       const torch::Tensor& scales_a, const torch::Tensor& scales_b) {
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  launch_sm90_fp8_blockwise_scaled_mm<OutType, TileShape, ClusterShape>(out, a, b, scales_a, scales_b);
+}
+
+torch::Tensor fp8_blockwise_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b,
+                                      const torch::Tensor& scales_a, const torch::Tensor& scales_b,
+                                      const torch::Dtype& out_dtype) {
+  TORCH_CHECK(mat_a.is_cuda(), "mat_a must be a CUDA tensor");
+  TORCH_CHECK(mat_b.is_cuda(), "mat_b must be a CUDA tensor");
+  TORCH_CHECK(mat_a.dim() == 2, "mat_a must be a 2D tensor");
+  TORCH_CHECK(mat_b.dim() == 2, "mat_b must be a 2D tensor");
+  TORCH_CHECK(mat_a.stride(1) == 1, "mat_a must be a row major tensor");
+  TORCH_CHECK(mat_b.stride(0) == 1, "mat_a must be a column major tensor");
+  TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied");
+
+  TORCH_CHECK((mat_a.size(1) * mat_a.element_size()) % 16 == 0,
+              "mat_a must be multiple of 16 bytes for memory alignment");
+  TORCH_CHECK((mat_b.size(0) * mat_b.element_size()) % 16 == 0,
+              "mat_b must be multiple of 16 bytes for memory alignment");
+  TORCH_CHECK(mat_a.scalar_type() == torch::kFloat8_e4m3fn, "mat_a must be Float8_e4m3fn");
+  TORCH_CHECK(mat_b.scalar_type() == torch::kFloat8_e4m3fn, "mat_b must be Float8_e4m3fn");
+  TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16");
+
+  auto is_contiguous_vector = [](const torch::Tensor& t) {
+    auto t_sizes = t.sizes();
+    return t.is_contiguous() &&
+           (t.dim() == 1 || (t.dim() == 2 && *std::min_element(t_sizes.begin(), t_sizes.end()) == 1));
+  };
+
+  TORCH_CHECK(mat_a.size(0) == scales_a.size(0), "size of scales_a is not matched");
+  TORCH_CHECK(mat_a.size(1) / 128 == scales_a.size(1), "size of scales_a is not matched");
+  TORCH_CHECK(scales_a.stride(0) == 1 || is_contiguous_vector(scales_a), "scales_a must be M major");
+  TORCH_CHECK(mat_b.size(0) / 128 == scales_b.size(0), "size of scales_b is not matched");
+  TORCH_CHECK(mat_b.size(1) / 128 == scales_b.size(1), "size of scales_b is not matched");
+  TORCH_CHECK(scales_b.stride(0) == 1 || is_contiguous_vector(scales_b), "scales_b must be K major");
+  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, "scales_a must be Float32");
+  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, "scales_b must be Float32");
+
+  torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype));
+  TORCH_CHECK((out.size(1) * out.element_size()) % 16 == 0, "out must be multiple of 16 bytes for memory alignment");
+
+  auto sm_version = getSMVersion();
+
+#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+#if defined CUDA_VERSION && CUDA_VERSION >= 12000
+  if (sm_version >= 90) {
+    if (out_dtype == torch::kBFloat16) {
+      sm90_fp8_blockwise_dispatch_shape<cutlass::bfloat16_t>(out, mat_a, mat_b, scales_a, scales_b);
+    } else {
+      sm90_fp8_blockwise_dispatch_shape<cutlass::half_t>(out, mat_a, mat_b, scales_a, scales_b);
+    }
+    return out;
+  }
+#endif
+#endif
+
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "No implemented fp8_blockwise_scaled_mm for current compute capability: ", sm_version);
+}
--- a/sgl-kernel/src/sgl-kernel/include/sgl_kernels_ops.h
+++ b/sgl-kernel/src/sgl-kernel/include/sgl_kernels_ops.h
@@ -60,6 +60,11 @@ torch::Tensor fp8_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat
                            const torch::Tensor& scales_b, const torch::Dtype& out_dtype,
                            const c10::optional<torch::Tensor>& bias);

+// fp8_blockwise_scaled_mm
+torch::Tensor fp8_blockwise_scaled_mm(const torch::Tensor& mat_a, const torch::Tensor& mat_b,
+                                      const torch::Tensor& scales_a, const torch::Tensor& scales_b,
+                                      const torch::Dtype& out_dtype);
+
 // lightning_attention_decode
 void lightning_attention_decode(const torch::Tensor& q, const torch::Tensor& k, const torch::Tensor& v,
                                const torch::Tensor& past_kv, const torch::Tensor& slope, torch::Tensor output,
--- a/sgl-kernel/src/sgl-kernel/ops/init.py
+++ b/sgl-kernel/src/sgl-kernel/ops/init.py
@@ -125,6 +125,16 @@ def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None):
    )


+def fp8_blockwise_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype):
+    return torch.ops.sgl_kernels.fp8_blockwise_scaled_mm(
+        mat_a,
+        mat_b,
+        scales_a,
+        scales_b,
+        out_dtype,
+    )
+
+
 def fp8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None):
    return torch.ops.sgl_kernels.fp8_scaled_mm(
        mat_a,
--- a/sgl-kernel/src/sgl-kernel/torch_extension.cc
+++ b/sgl-kernel/src/sgl-kernel/torch_extension.cc
@@ -54,6 +54,12 @@ TORCH_LIBRARY_EXPAND(sgl_kernels, m) {
      "bias) -> Tensor");
  m.impl("fp8_scaled_mm", torch::kCUDA, &fp8_scaled_mm);

+  // fp8_blockwise_scaled_mm
+  m.def(
+      "fp8_blockwise_scaled_mm(Tensor mat_a, Tensor mat_b, Tensor scales_a, Tensor scales_b, ScalarType out_dtype) -> "
+      "Tensor");
+  m.impl("fp8_blockwise_scaled_mm", torch::kCUDA, &fp8_blockwise_scaled_mm);
+
  // lightning_attention_decode
  m.def(
      "lightning_attention_decode(Tensor q, Tensor k, Tensor v, Tensor past_kv, Tensor slope, Tensor! output, Tensor! "
--- a/sgl-kernel/tests/test_fp8_blockwise_gemm.py
+++ b/sgl-kernel/tests/test_fp8_blockwise_gemm.py
@@ -0,0 +1,112 @@
+import unittest
+from typing import Optional, Type
+
+import torch
+from sgl_kernel import fp8_blockwise_scaled_mm
+
+
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def scale_shape(shape, group_shape):
+    assert len(shape) == len(group_shape)
+    return tuple(cdiv(shape[i], group_shape[i]) for i in range(len(group_shape)))
+
+
+def baseline_scaled_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: Type[torch.dtype],
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+
+    # We treat N-dimensional group scaling as extended numpy-style broadcasting
+    # in numpy simply stretches dimensions with an extent of 1 to match the
+    # the target shape by repeating the data along that dimension (broadcasting)
+    # , we extend these semantics to say if the extent of a dimension in the
+    # source shape is not 1 and does not match the target shape we repeat each
+    # element along that dimension src_shape[dim] // target_shape[dim] times
+    # example if we have:
+    #       a = [[1, 2], and target_shape = (2, 4)
+    #            [3, 4]]
+    # then we would expand a to:
+    #       a = [[1, 1, 2, 2],
+    #            [3, 3, 4, 4]]
+    # NOTE this function this function does not explicitly broadcast dimensions
+    # with an extent of 1, since this can be done implicitly by pytorch
+    def group_broadcast(t, shape):
+        for i, s in enumerate(shape):
+            if t.shape[i] != s and t.shape[i] != 1:
+                assert s % t.shape[i] == 0
+                t = (
+                    t.unsqueeze(i + 1)
+                    .expand(*t.shape[: i + 1], s // t.shape[i], *t.shape[i + 1 :])
+                    .flatten(i, i + 1)
+                )
+        return t
+
+    scale_a = group_broadcast(scale_a, a.shape)
+    scale_b = group_broadcast(scale_b, b.shape)
+
+    output = torch.mm(
+        (scale_a * a.to(dtype=torch.float32)), (scale_b * b.to(dtype=torch.float32))
+    ).to(out_dtype)
+
+    if bias is not None:
+        output = output + bias
+
+    return output
+
+
+class TestFp8Gemm(unittest.TestCase):
+    def _test_accuracy_once(self, M, N, K, out_dtype, device):
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a_fp32 = (
+            (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+        )
+        a_fp8 = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        b_fp32 = (
+            (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+        )
+        b_fp8 = b_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn).t()
+
+        scale_a_group_shape = (1, 128)
+        scale_b_group_shape = (128, 128)
+        scale_a_shape = scale_shape(a_fp8.shape, scale_a_group_shape)
+        scale_b_shape = scale_shape(b_fp8.shape, scale_b_group_shape)
+
+        scale_a = torch.randn(scale_a_shape, device=device, dtype=torch.float32) * 0.001
+        scale_b = torch.randn(scale_b_shape, device=device, dtype=torch.float32) * 0.001
+        scale_a = scale_a.t().contiguous().t()
+        scale_b = scale_b.t().contiguous().t()
+
+        o1 = torch.empty((M, N), device=device, dtype=torch.bfloat16)
+        o = baseline_scaled_mm(a_fp8, b_fp8, scale_a, scale_b, out_dtype)
+        o1 = fp8_blockwise_scaled_mm(a_fp8, b_fp8, scale_a, scale_b, out_dtype)
+
+        rtol = 0.02
+        atol = 1
+        torch.testing.assert_close(o, o1, rtol=rtol, atol=atol)
+        print(f"M={M}, N={N}, K={K}, out_dtype={out_dtype}: OK")
+
+    def test_accuracy(self):
+        Ms = [1, 128, 512, 1024, 4096]
+        Ns = [128, 512, 1024, 4096]
+        Ks = [512, 1024, 4096, 8192, 16384]
+        out_dtypes = [torch.bfloat16, torch.float16]
+        for M in Ms:
+            for N in Ns:
+                for K in Ks:
+                    for out_dtype in out_dtypes:
+                        self._test_accuracy_once(M, N, K, out_dtype, "cuda")
+
+
+if __name__ == "__main__":
+    unittest.main()