[code clean] add H20 cutlass groupGemm default config (#9333)

Co-authored-by: wanghanpei <wanghanpei@bytedance.com>
2025-08-22 10:23:29 +08:00
parent 53e2cd464f
commit 5fd311d33e
1 changed files with 15 additions and 35 deletions
--- a/sgl-kernel/csrc/moe/fp8_blockwise_moe_kernel.cu
+++ b/sgl-kernel/csrc/moe/fp8_blockwise_moe_kernel.cu
@@ -437,34 +437,6 @@ void sm100_fp8_blockwise_group_mm_dispatch_shape(
  }
 }
 #define JOIN_STRUCT_PP_NAME(m, n, k, a, b, c) sm90_fp8_pp_config##_##m##_##n##_##k##_##a##_##b##_##c
 #define JOIN_STRUCT_CO_NAME(m, n, k, a, b, c) sm90_fp8_co_config##_##m##_##n##_##k##_##a##_##b##_##c
 #define GENERATE_SM90_FP8_PP_CONFIG(M, N, K, A, B, C)                                                  \
  struct JOIN_STRUCT_PP_NAME(M, N, K, A, B, C) {                                                       \
    using ElementA = cutlass::float_e4m3_t;                                                            \
    using MmaTileShape = Shape<cute::Int<M>, cute::Int<N>, cute::Int<K>>;                              \
    using ClusterShape = Shape<cute::Int<A>, cute::Int<B>, cute::Int<C>>;                              \
    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8BlockScaledAccum; \
    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;                    \
    using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128>;                        \
    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());                                       \
    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());                                       \
  };
 #define GENERATE_SM90_FP8_CO_CONFIG(M, N, K, A, B, C)                                                     \
  struct JOIN_STRUCT_CO_NAME(M, N, K, A, B, C) {                                                          \
    using ElementA = cutlass::float_e4m3_t;                                                               \
    using MmaTileShape = Shape<cute::Int<M>, cute::Int<N>, cute::Int<K>>;                                 \
    using ClusterShape = Shape<cute::Int<A>, cute::Int<B>, cute::Int<C>>;                                 \
    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8BlockScaledAccum; \
    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;                    \
    using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128>;                           \
    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());                                          \
    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());                                          \
  };
 template <typename OutType>
 void sm90_fp8_blockwise_group_mm_dispatch_shape(
    torch::Tensor& output,
@@ -509,20 +481,28 @@ void sm90_fp8_blockwise_group_mm_dispatch_shape(
    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
  };
-  // [NOTE] Tuned for H20
+  // [NOTE] default for H20
-  GENERATE_SM90_FP8_PP_CONFIG(64, 128, 128, 1, 2, 1)
+  struct MmaConfigH20_default {
    using ElementA = cutlass::float_e4m3_t;
    using MmaTileShape = Shape<_64, _128, _128>;
    using ClusterShape = Shape<_1, _2, _1>;
    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8BlockScaledAccum;
    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
    using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128>;
    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
  };
  int num_experts = (int)expert_offsets.size(0);
  torch::TensorOptions options_int = torch::TensorOptions().dtype(torch::kInt64).device(a.device());
  torch::Tensor problem_sizes_transpose = torch::empty(num_experts * 3, options_int);
  bool tuning_H20_kernel = getBoolEnv("SGL_TUNE_DEVICE_KERNEL");
  const std::string H20_device_type_str = "NVIDIA H20";
-  bool is_h20 = isDeviceType(H20_device_type_str);
+  bool is_h20_device = isDeviceType(H20_device_type_str);
-  if (is_h20 && tuning_H20_kernel) {
+  if (is_h20_device) {
-    using execute_gemm_config = sm90_fp8_pp_config_64_128_128_1_2_1;
+    using execute_gemm_config = MmaConfigH20_default;
    run_get_group_gemm_starts<
        execute_gemm_config::LayoutSFA,
        execute_gemm_config::LayoutSFB,