Update CUTLASS 4.2 & Enable K-Major Scale Factor for SM90 FP8 Blockwise Group GEMM (#9559)
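The kernel-side change is the scale-factor layout: the SM90 blockwise group GEMM previously consumed MN-major scale factors, while the SM100 path already consumed K-major ones; this commit moves SM90 to K-major as well, so the test below no longer branches on compute capability. As a minimal PyTorch sketch of the two layouts (illustration only, not code from this commit):

    import torch

    M, K = 256, 1024                     # one scale per 1x128 group along K
    a_scale = torch.rand(M, K // 128)    # (M, k) with k = K // 128

    # K-major: k is the fastest-varying dim, stride (k, 1);
    # this is what the new SM90 path consumes.
    a_scale_k_major = a_scale.contiguous()

    # MN-major: M is the fastest-varying dim, (k, M):(M, 1);
    # this is what the old SM90 test path built via .t().contiguous().
    a_scale_mn_major = a_scale.t().contiguous()

    assert a_scale_k_major.stride() == (K // 128, 1)
    assert a_scale_mn_major.stride() == (M, 1)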
@@ -45,7 +45,7 @@ include(FetchContent)
 FetchContent_Declare(
   repo-cutlass
   GIT_REPOSITORY https://github.com/NVIDIA/cutlass
-  GIT_TAG 664c4f7b3ed1959414905025728eef5568209479
+  GIT_TAG a49a78ffefc86a87160dfe0ccc3a3a2d1622c918
   GIT_SHALLOW OFF
 )
 FetchContent_Populate(repo-cutlass)
@@ -457,39 +457,40 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
     const torch::Tensor& problem_sizes,
     const torch::Tensor& expert_offsets,
     const torch::Tensor& workspace) {
-  struct MmaConfig0 {
+  struct MmaConfigSmallM {
+    // Swap A/B
+    using ElementA = cutlass::float_e4m3_t;
+    using MmaTileShape = Shape<_128, _32, _128>;
+    using ClusterShape = Shape<_2, _1, _1>;
+    // TODO: Check Pingpong or Cooperative
+    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8BlockScaledAccum;
+    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+    using ScaleConfig =
+        cutlass::detail::Sm90BlockwiseScaleConfig<128, 1, 128, cute::GMMA::Major::K, cute::GMMA::Major::K>;
+    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+  };
+
+  struct MmaConfigH20LargeK {
     using ElementA = cutlass::float_e4m3_t;
     using MmaTileShape = Shape<_64, _128, _128>;
     using ClusterShape = Shape<_2, _1, _1>;
     using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8BlockScaledAccum;
     using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
-    using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128>;
+    using ScaleConfig =
+        cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128, cute::GMMA::Major::K, cute::GMMA::Major::K>;
     using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
     using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
   };

-  struct MmaConfig1 {
+  struct MmaConfigHx00AndH20SmallK {
     using ElementA = cutlass::float_e4m3_t;
     using MmaTileShape = Shape<_128, _128, _128>;
     using ClusterShape = Shape<_1, _2, _1>;
     using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8BlockScaledAccum;
     using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;
-    using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128>;
+    using ScaleConfig =
+        cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128, cute::GMMA::Major::K, cute::GMMA::Major::K>;
     using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
     using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
   };

   // [NOTE] default for H20
   struct MmaConfigH20_default {
     using ElementA = cutlass::float_e4m3_t;
     using MmaTileShape = Shape<_64, _128, _128>;
     using ClusterShape = Shape<_1, _2, _1>;
     using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8BlockScaledAccum;
     using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
-    using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128>;
+    using ScaleConfig =
+        cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128, cute::GMMA::Major::K, cute::GMMA::Major::K>;
     using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
     using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
   };
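Three configurations now cover the SM90 dispatch that follows: MmaConfigSmallM swaps the A/B operands and uses a narrow Shape<_128, _32, _128> tile for small token counts (its ScaleConfig is <128, 1, 128> rather than <1, 128, 128> because the swap also exchanges the scale-factor roles of A and B), MmaConfigH20LargeK keeps the Pingpong schedule for H20 with K > 128, and MmaConfigHx00AndH20SmallK uses the Cooperative schedule for the remaining cases. A rough Python mirror of the selection logic (thresholds taken from the dispatch below; the function itself is illustrative, not part of the commit):

    def pick_sm90_config(m: int, k: int, is_h20: bool) -> str:
        # m = total tokens, a.size(0); k = reduction dim, a.size(1)
        if m <= 2048:
            return "MmaConfigSmallM"            # swapped A/B, Pingpong
        if is_h20 and k > 128:
            return "MmaConfigH20LargeK"         # Pingpong
        return "MmaConfigHx00AndH20SmallK"      # Cooperative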
@@ -497,33 +498,34 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
   int num_experts = (int)expert_offsets.size(0);
   torch::TensorOptions options_int = torch::TensorOptions().dtype(torch::kInt64).device(a.device());
   torch::Tensor problem_sizes_transpose = torch::empty(num_experts * 3, options_int);
   torch::Tensor output_t = output.t();
   torch::Tensor a_t = a.t();
   torch::Tensor b_t = b.transpose(1, 2);
   torch::Tensor scales_a_t = scales_a.t();
   torch::Tensor scales_b_t = scales_b.transpose(1, 2);

-  const std::string H20_device_type_str = "NVIDIA H20";
-  bool is_h20_device = isDeviceType(H20_device_type_str);
+  const std::string H20_device_type_str("NVIDIA H20");
+  bool is_h20_device = std::string(at::cuda::getCurrentDeviceProperties()->name) == H20_device_type_str;

-  if (is_h20_device) {
-    using execute_gemm_config = MmaConfigH20_default;
-    run_get_group_gemm_starts<
-        execute_gemm_config::LayoutSFA,
-        execute_gemm_config::LayoutSFB,
-        execute_gemm_config::ScaleConfig>(
+  if (a.size(0) <= 2048) {
+    run_get_group_gemm_starts<MmaConfigSmallM::LayoutSFA, MmaConfigSmallM::LayoutSFB, MmaConfigSmallM::ScaleConfig>(
         expert_offsets,
         a_ptrs,
         b_ptrs,
         out_ptrs,
         a_scales_ptrs,
         b_scales_ptrs,
         a,
         b,
         output,
         scales_a,
         scales_b,
         b_t,
         a_t,
         output_t,
         scales_b_t,
         scales_a_t,
         layout_sfa,
         layout_sfb,
         problem_sizes,
-        problem_sizes_transpose);
-
-    launch_sm90_fp8_blockwise_scaled_group_mm<OutType, execute_gemm_config, cutlass::layout::RowMajor>(
+        problem_sizes_transpose,
+        true);
+
+    launch_sm90_fp8_blockwise_scaled_group_mm<OutType, MmaConfigSmallM, cutlass::layout::ColumnMajor>(
         out_ptrs,
         a_ptrs,
         b_ptrs,
@@ -534,13 +536,17 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
         stride_c,
         layout_sfa,
         layout_sfb,
         problem_sizes,
         problem_sizes_transpose,
         expert_offsets,
         workspace);
     output = output_t.t();
   } else {
-    if (at::cuda::getCurrentDeviceProperties()->multiProcessorCount == 78 && a.size(1) > 128) {
+    if (is_h20_device && a.size(1) > 128) {
       // For H20 with K > 128, use Pingpong Schedule
-      run_get_group_gemm_starts<MmaConfig0::LayoutSFA, MmaConfig0::LayoutSFB, MmaConfig0::ScaleConfig>(
+      run_get_group_gemm_starts<
+          MmaConfigH20LargeK::LayoutSFA,
+          MmaConfigH20LargeK::LayoutSFB,
+          MmaConfigH20LargeK::ScaleConfig>(
           expert_offsets,
           a_ptrs,
           b_ptrs,
@@ -556,7 +562,7 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
           layout_sfb,
           problem_sizes,
           problem_sizes_transpose);
-      launch_sm90_fp8_blockwise_scaled_group_mm<OutType, MmaConfig0, cutlass::layout::RowMajor>(
+      launch_sm90_fp8_blockwise_scaled_group_mm<OutType, MmaConfigH20LargeK, cutlass::layout::RowMajor>(
          out_ptrs,
          a_ptrs,
          b_ptrs,
@@ -572,7 +578,10 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
           workspace);
     } else {
       // For H20 with K <= 128, and H100 & H200 & H800, use Cooperative Schedule
-      run_get_group_gemm_starts<MmaConfig1::LayoutSFA, MmaConfig1::LayoutSFB, MmaConfig1::ScaleConfig>(
+      run_get_group_gemm_starts<
+          MmaConfigHx00AndH20SmallK::LayoutSFA,
+          MmaConfigHx00AndH20SmallK::LayoutSFB,
+          MmaConfigHx00AndH20SmallK::ScaleConfig>(
           expert_offsets,
           a_ptrs,
           b_ptrs,
@@ -588,7 +597,7 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
           layout_sfb,
           problem_sizes,
           problem_sizes_transpose);
-      launch_sm90_fp8_blockwise_scaled_group_mm<OutType, MmaConfig1, cutlass::layout::RowMajor>(
+      launch_sm90_fp8_blockwise_scaled_group_mm<OutType, MmaConfigHx00AndH20SmallK, cutlass::layout::RowMajor>(
          out_ptrs,
          a_ptrs,
          b_ptrs,
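The small-M branch above hands the kernel the transposed views (b_t, a_t, output_t, scales_b_t, scales_a_t) and launches with cutlass::layout::ColumnMajor: it solves the transposed problem C^T = B^T A^T, and transposing the result back recovers C. A shape-level sketch of that identity in plain PyTorch (sizes are arbitrary):

    import torch

    M, K, N = 64, 256, 512
    a = torch.randn(M, K)
    b = torch.randn(K, N)

    c_ref = a @ b              # (M, N): the GEMM we actually want
    c_swapped = b.t() @ a.t()  # (N, M): the problem the swapped launch solves
    assert torch.allclose(c_ref, c_swapped.t(), atol=1e-4)

The diff that follows is the unit test, which drops the use_custom_kernel path and the per-compute-capability scale-factor branches now that SM90 and SM100 agree on K-major.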
@@ -5,10 +5,6 @@ import pytest
 import torch
 from sgl_kernel import fp8_blockwise_scaled_grouped_mm

-from sglang.srt.layers.quantization.fp8_kernel import (
-    per_token_group_quant_fp8_hopper_moe_mn_major,
-)
-

 def cdiv(a: int, b: int) -> int:
     return -(a // -b)
@@ -106,24 +102,19 @@ def is_sm90_supported(device=None) -> bool:
     not (is_sm100_supported() or is_sm90_supported()),
     reason="fp8_blockwise_scaled_grouped_mm at sgl-kernel is only supported on sm100 or sm90",
 )
-@pytest.mark.parametrize("num_experts", [8, 16])
+@pytest.mark.parametrize("num_experts", [8, 16, 32, 64, 128])
 @pytest.mark.parametrize("out_dtype", [torch.half, torch.bfloat16])
-@pytest.mark.parametrize("use_custom_kernel", [True, False])
-def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype, use_custom_kernel):
-    cc = torch.cuda.get_device_capability(None)[0]
-    if cc == 10 and use_custom_kernel:
-        return
+def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype):
     device = "cuda"
-    alignment = 16
-    n_g = alignment * random.randint(1, 5) * 128
-    k_g = alignment * random.randint(1, 5) * 128
+    alignment = 128
+    n_g = random.randint(1, 64) * 128
+    k_g = random.randint(1, 64) * 128

     expert_offsets = torch.zeros((num_experts + 1), device=device, dtype=torch.int32)
     problem_sizes = torch.zeros((num_experts, 3), device=device, dtype=torch.int32)
     layout_sfa = torch.zeros((num_experts, 5), device=device, dtype=torch.int32)
     layout_sfb = torch.zeros((num_experts, 5), device=device, dtype=torch.int32)

-    a_original_tensors = []
     a_tensors = []
     b_tensors = []
     a_scales_tensors = []
@@ -131,7 +122,7 @@ def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype, use_custom_kernel):
     baseline_tensors = []

     for g in range(num_experts):
-        m_g = alignment * random.randint(1, 64)
+        m_g = random.randint(1, 256)
         expert_offsets[g + 1] = expert_offsets[g] + m_g
         problem_sizes[g][:] = torch.tensor([m_g, n_g, k_g], device=device)
@@ -144,7 +135,6 @@ def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype, use_custom_kernel):
         b_g, b_scale = per_block_cast_to_fp8(
             b
         )  # bg -- (K, N):(N, 1), b_scale() -- (k, n):(n, 1)
-        a_original_tensors.append(a)
         a_tensors.append(a_g)
         b_tensors.append(b_g)
         a_scales_tensors.append(a_scale)
@@ -152,9 +142,6 @@ def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype, use_custom_kernel):

         baseline = torch.mm(a, b)
         baseline_tensors.append(baseline)
-    a_original_stack = torch.empty(
-        (expert_offsets[-1], k_g), device=device, dtype=out_dtype
-    )
     a_stack = torch.empty(
         (expert_offsets[-1], k_g), device=device, dtype=torch.float8_e4m3fn
     )
@@ -162,52 +149,28 @@ def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype, use_custom_kernel):
         (num_experts, n_g, k_g), device=device, dtype=torch.float8_e4m3fn
     )
     a_scale_stack = torch.empty(
-        (expert_offsets[-1] * (k_g // 128)), device=device, dtype=torch.float32
+        (expert_offsets[-1], (k_g // 128)), device=device, dtype=torch.float32
     )
     b_scale_stack = torch.empty(
-        (num_experts, k_g // 128, n_g // 128), device=device, dtype=torch.float32
+        (num_experts, n_g // 128, k_g // 128), device=device, dtype=torch.float32
     )

     for g in range(num_experts):
-        a_original_stack[expert_offsets[g] : expert_offsets[g + 1]] = (
-            a_original_tensors[g]
-        )
-        a_stack[expert_offsets[g] : expert_offsets[g + 1]] = a_tensors[
+        # Matrix A is Row-Major.
+        a_stack[expert_offsets[g] : expert_offsets[g + 1], :] = a_tensors[
             g
-        ]  # a_stack[expert_offsets[g] : expert_offsets[g + 1]] -- (M, K):(K, 1)
+        ]  # a_stack[expert_offsets[g] : expert_offsets[g + 1], :] -- (M, K):(K, 1)
         b_stack[g] = b_tensors[g].t()  # b_stack[g] -- (N, K):(K, 1)
-        if cc == 9:
-            # For SM90, we need MN-Major scale factor
-            # a_scales_tensors[g] -- (M, k):(k, 1)
-            # a_scales_tensors[g].t().contiguous() -- (k, M):(M, 1)
-            a_scale_stack[
-                expert_offsets[g] * (k_g // 128) : expert_offsets[g + 1] * (k_g // 128)
-            ] = (a_scales_tensors[g].t().contiguous().view(-1))
-            b_scale_stack[g] = b_scales_tensors[g]  # b_scale_stack[g] -- (k, n):(n, 1)
-        elif cc == 10:
-            # For SM100, we need K-Major scale factor
-            # a_scales_tensors[g] -- (M, k):(k, 1)
-            a_scale_stack[
-                expert_offsets[g] * (k_g // 128) : expert_offsets[g + 1] * (k_g // 128)
-            ] = a_scales_tensors[g].view(-1)
-            b_scale_stack[g] = b_scales_tensors[
-                g
-            ]  # b_scale_stack[g] -- (k, n):(n, 1), we need transpose & contiguous later
-    a_scale_stack = a_scale_stack.view(expert_offsets[-1], k_g // 128)
-    b_stack = b_stack.transpose(1, 2)  # Transpose Matrix B to Column-Major.
-    if cc == 10:
-        b_scale_stack = b_scale_stack.transpose(1, 2).contiguous()
-
-    if use_custom_kernel:
-        # Replace a_stack, a_scale_stack with custom kernel output
-        a_stack, a_scale_stack = per_token_group_quant_fp8_hopper_moe_mn_major(
-            a_original_stack,
-            expert_offsets[:-1],
-            problem_sizes,
-            128,
-            expert_tokens_alignment=alignment,
-        )
+        # We need K-Major scale factor
+        a_scale_stack[expert_offsets[g] : expert_offsets[g + 1], :] = a_scales_tensors[
+            g
+        ]
+        b_scale_stack[g] = b_scales_tensors[
+            g
+        ].t()  # b_scale_stack[g] -- (k, n):(n, 1), we need transpose & contiguous later
+    b_stack = b_stack.transpose(1, 2)  # Transpose Matrix B to Column-Major.
+    b_scale_stack = b_scale_stack.transpose(1, 2)

     c_out = torch.empty((expert_offsets[-1], n_g), device=device, dtype=out_dtype)
     a_strides = torch.full(
@@ -250,7 +213,7 @@ def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype, use_custom_kernel):
     diff = calc_diff(actual, baseline)
     assert diff < 0.001
     print(
-        f"cc={cc}0 num_experts={num_experts}, out_dtype={out_dtype}, diff={diff:.5f}: OK"
+        f"m_g={baseline.shape[0]} n_g={n_g} k_g={k_g} num_experts={num_experts}, out_dtype={out_dtype}, diff={diff:.5f}: OK"
     )
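With the layout unified, the test builds K-major scale stacks for both operands regardless of device generation. A shape-and-stride sketch of the stacks constructed above (example sizes, not taken from the diff):

    import torch

    num_experts, total_m, n_g, k_g = 4, 500, 512, 1024

    a_scale_stack = torch.empty(total_m, k_g // 128)                  # (sum of m_g, k)
    b_scale_stack = torch.empty(num_experts, n_g // 128, k_g // 128)  # (E, n, k)

    # transpose(1, 2) exposes per-expert (k, n) views whose k dimension is
    # stride-1, i.e. K-major, without copying:
    b_view = b_scale_stack.transpose(1, 2)
    assert b_view.shape == (num_experts, k_g // 128, n_g // 128)
    assert b_view.stride(1) == 1 and not b_view.is_contiguous()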