[1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (#5281)

2025-04-22 22:28:20 -07:00
parent 71d1785f2d
commit e62c49557d
8 changed files with 732 additions and 1 deletions
--- a/sgl-kernel/include/sgl_kernel_ops.h
+++ b/sgl-kernel/include/sgl_kernel_ops.h
@@ -209,6 +209,20 @@ std::vector<at::Tensor> moe_fused_gate(
    int64_t n_share_experts_fusion,
    double routed_scaling_factor);

+void fp8_blockwise_scaled_grouped_mm(  // declaration only (returns void); NOTE(review): by its name, an FP8 block-wise scaled grouped matmul — confirm against the definition
+    torch::Tensor& output,  // the only non-const parameter; presumably receives the result in place — TODO confirm
+    const torch::Tensor& a,  // presumably the left-hand operand — TODO confirm dtype/shape
+    const torch::Tensor& b,  // presumably the right-hand operand — TODO confirm dtype/shape
+    const torch::Tensor& scales_a,  // presumably scale factors for `a` — TODO confirm granularity
+    const torch::Tensor& scales_b,  // presumably scale factors for `b` — TODO confirm granularity
+    const torch::Tensor& stride_a,  // presumably stride metadata for `a` — TODO confirm contents
+    const torch::Tensor& stride_b,  // presumably stride metadata for `b` — TODO confirm contents
+    const torch::Tensor& stride_c,  // presumably stride metadata for the result — TODO confirm contents
+    const torch::Tensor& layout_sfa,  // presumably scale-factor layout for `a` — TODO confirm
+    const torch::Tensor& layout_sfb,  // presumably scale-factor layout for `b` — TODO confirm
+    const torch::Tensor& problem_sizes,  // presumably per-group problem dimensions — TODO confirm
+    const torch::Tensor& expert_offsets);  // presumably per-expert offsets into the grouped data — TODO confirm
+
 /*
 * From csrc/speculative
 */