[1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (#5281)

This commit is contained in:
Elfie Guo
2025-04-22 22:28:20 -07:00
committed by GitHub
parent 71d1785f2d
commit e62c49557d
8 changed files with 732 additions and 1 deletions

5
sgl-kernel/csrc/common_extension.cc Normal file → Executable file
View File

@@ -150,6 +150,11 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
"n_share_experts_fusion, float routed_scaling_factor) -> "
"(Tensor[])");
m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
// Declare the operator schema for fp8_blockwise_scaled_grouped_mm: twelve Tensor
// arguments and an empty return `()`. The schema string is split across adjacent
// literals, which the compiler concatenates into a single string.
// NOTE(review): since the op returns nothing, results are presumably written into
// `output`; the schema does not mark that argument as mutable (e.g. `Tensor!`) —
// confirm against the kernel implementation whether an alias annotation is needed.
m.def(
"fp8_blockwise_scaled_grouped_mm(Tensor output, Tensor a, Tensor b, Tensor scales_a, Tensor scales_b, Tensor "
"stride_a, Tensor stride_b, Tensor stride_c, Tensor layout_sfa, Tensor layout_sfb, Tensor problem_sizes, Tensor "
"expert_offsets) -> ()");
// Bind the schema above to the C++ function of the same name for the CUDA dispatch key.
m.impl("fp8_blockwise_scaled_grouped_mm", torch::kCUDA, &fp8_blockwise_scaled_grouped_mm);
/*
* From csrc/speculative