Add CUTLASS FP8 Blockscale MoE kernel for Hopper architecture (#7278)
Co-authored-by: HydraQYH <QYH820@Outlook.com> Co-authored-by: TianQiLin666666 <1834987979@qq.com>
This commit is contained in:
@@ -53,9 +53,15 @@ def is_sm100_supported(device=None) -> bool:
|
||||
)
|
||||
|
||||
|
||||
def is_sm90_supported(device=None) -> bool:
    """Return True if the active CUDA device is SM90 (Hopper) with CUDA >= 12.8.

    Args:
        device: Optional device index/handle forwarded to
            ``torch.cuda.get_device_capability``; ``None`` means the current device.

    Returns:
        True only when the device's compute-capability major version is 9 and
        the CUDA toolkit version is at least 12.8.
    """
    # torch.version.cuda is None on CPU-only builds; the original string
    # comparison would raise TypeError there, and lexicographic comparison
    # misorders versions (e.g. "12.10" < "12.8" as strings). Compare numerically.
    cuda_version = torch.version.cuda
    if cuda_version is None:
        return False
    cuda_major, cuda_minor = (int(part) for part in cuda_version.split(".")[:2])
    return (
        torch.cuda.get_device_capability(device)[0] == 9
        and (cuda_major, cuda_minor) >= (12, 8)
    )
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_sm100_supported(),
|
||||
reason="fp8_blockwise_scaled_grouped_mm at sgl-kernel is only supported on sm100",
|
||||
not (is_sm100_supported() or is_sm90_supported()),
|
||||
reason="fp8_blockwise_scaled_grouped_mm at sgl-kernel is only supported on sm100 or sm90",
|
||||
)
|
||||
@pytest.mark.parametrize("num_experts", [8, 16])
|
||||
@pytest.mark.parametrize("out_dtype", [torch.half, torch.bfloat16])
|
||||
@@ -162,7 +168,7 @@ def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype):
|
||||
for g in range(num_experts):
|
||||
baseline = baseline_tensors[g]
|
||||
actual = c_out[expert_offsets[g] : expert_offsets[g + 1]]
|
||||
torch.testing.assert_close(actual, baseline, rtol=1e-2, atol=5e-4)
|
||||
torch.testing.assert_close(actual, baseline, rtol=1e-2, atol=1e-3)
|
||||
print(f"num_experts={num_experts}, out_dtype={out_dtype}: OK")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user