[1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (#5281)

This commit is contained in:
Elfie Guo
2025-04-22 22:28:20 -07:00
committed by GitHub
parent 71d1785f2d
commit e62c49557d
8 changed files with 732 additions and 1 deletions

7
sgl-kernel/python/sgl_kernel/__init__.py Normal file → Executable file
View File

@@ -41,7 +41,12 @@ from sgl_kernel.gemm import (
sgl_per_token_group_quant_int8,
sgl_per_token_quant_fp8,
)
from sgl_kernel.moe import moe_align_block_size, moe_fused_gate, topk_softmax
from sgl_kernel.moe import (
fp8_blockwise_scaled_grouped_mm,
moe_align_block_size,
moe_fused_gate,
topk_softmax,
)
from sgl_kernel.sampling import (
min_p_sampling_from_probs,
top_k_renorm_prob,

30
sgl-kernel/python/sgl_kernel/moe.py Normal file → Executable file
View File

@@ -60,3 +60,33 @@ def moe_fused_gate(
n_share_experts_fusion,
routed_scaling_factor,
)
def fp8_blockwise_scaled_grouped_mm(
    output,
    a,
    b,
    scales_a,
    scales_b,
    stride_a,
    stride_b,
    stride_c,
    layout_sfa,
    layout_sfb,
    problem_sizes,
    expert_offsets,
):
    """Run the FP8 blockwise-scaled grouped GEMM custom kernel.

    Thin Python wrapper that forwards all arguments, in order, to the
    registered ``sgl_kernel`` torch custom op. The result is written into
    ``output`` in place; nothing is returned.

    NOTE(review): argument shapes/dtypes are defined by the underlying
    CUTLASS kernel registration, not visible here — consult the C++ op
    schema for the exact contract.
    """
    # Bind the default overload once, then dispatch with the same
    # positional argument order the op schema expects.
    grouped_mm_op = torch.ops.sgl_kernel.fp8_blockwise_scaled_grouped_mm.default
    grouped_mm_op(
        output,
        a,
        b,
        scales_a,
        scales_b,
        stride_a,
        stride_b,
        stride_c,
        layout_sfa,
        layout_sfb,
        problem_sizes,
        expert_offsets,
    )