[1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (#5281)
This commit is contained in:
5
sgl-kernel/csrc/common_extension.cc
Normal file → Executable file
5
sgl-kernel/csrc/common_extension.cc
Normal file → Executable file
@@ -150,6 +150,11 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
 "n_share_experts_fusion, float routed_scaling_factor) -> "
 "(Tensor[])");
 m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
+m.def(
+"fp8_blockwise_scaled_grouped_mm(Tensor output, Tensor a, Tensor b, Tensor scales_a, Tensor scales_b, Tensor "
+"stride_a, Tensor stride_b, Tensor stride_c, Tensor layout_sfa, Tensor layout_sfb, Tensor problem_sizes, Tensor "
+"expert_offsets) -> ()");
+m.impl("fp8_blockwise_scaled_grouped_mm", torch::kCUDA, &fp8_blockwise_scaled_grouped_mm);

 /*
 * From csrc/speculative
Reference in New Issue
Block a user