Add CUTLASS FP8 Blockscale MoE kernel for Hopper architecture (#7278)

Co-authored-by: HydraQYH <QYH820@Outlook.com>
Co-authored-by: TianQiLin666666 <1834987979@qq.com>
This commit is contained in:
ayrnb
2025-07-03 14:27:03 +08:00
committed by GitHub
parent 2ff572e28c
commit 2c4feaf308
3 changed files with 578 additions and 9 deletions

View File

@@ -53,9 +53,15 @@ def is_sm100_supported(device=None) -> bool:
)
def is_sm90_supported(device=None) -> bool:
    """Return True when *device* is an SM90 (Hopper) GPU and the CUDA
    toolkit version is at least 12.8.

    Args:
        device: optional device index / object forwarded to
            ``torch.cuda.get_device_capability``; ``None`` means the
            current device.

    Returns:
        bool: True only for compute capability major == 9 with CUDA >= 12.8.
    """
    # torch.version.cuda is a string like "12.4" (or None on CPU-only /
    # ROCm builds).  A lexicographic compare (`>= "12.8"`) would wrongly
    # reject "12.10"/"12.100", so compare numerically as (major, minor).
    cuda_version = torch.version.cuda
    if cuda_version is None:
        return False
    major, minor = (int(part) for part in cuda_version.split(".")[:2])
    return (
        torch.cuda.get_device_capability(device)[0] == 9
        and (major, minor) >= (12, 8)
    )
@pytest.mark.skipif(
not is_sm100_supported(),
reason="fp8_blockwise_scaled_grouped_mm at sgl-kernel is only supported on sm100",
not (is_sm100_supported() or is_sm90_supported()),
reason="fp8_blockwise_scaled_grouped_mm at sgl-kernel is only supported on sm100 or sm90",
)
@pytest.mark.parametrize("num_experts", [8, 16])
@pytest.mark.parametrize("out_dtype", [torch.half, torch.bfloat16])
@@ -162,7 +168,7 @@ def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype):
for g in range(num_experts):
baseline = baseline_tensors[g]
actual = c_out[expert_offsets[g] : expert_offsets[g + 1]]
torch.testing.assert_close(actual, baseline, rtol=1e-2, atol=5e-4)
torch.testing.assert_close(actual, baseline, rtol=1e-2, atol=1e-3)
print(f"num_experts={num_experts}, out_dtype={out_dtype}: OK")