[1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (#5281)

This commit is contained in:
Elfie Guo
2025-04-22 22:28:20 -07:00
committed by GitHub
parent 71d1785f2d
commit e62c49557d
8 changed files with 732 additions and 1 deletions

14
sgl-kernel/include/sgl_kernel_ops.h Normal file → Executable file
View File

@@ -209,6 +209,20 @@ std::vector<at::Tensor> moe_fused_gate(
int64_t n_share_experts_fusion,
double routed_scaling_factor);
/*
 * Declaration only; the definition lives outside this header.
 *
 * NOTE(review): judging by the name, this is an FP8 grouped matrix multiply
 * with block-wise scaling -- confirm against the implementation.
 *
 * `output` is the only tensor passed by non-const reference (presumably the
 * result destination -- TODO confirm); every other tensor is passed by const
 * reference. The function returns nothing.
 *
 * NOTE(review): the roles below are inferred from parameter names only:
 *   a, b                      - presumably the two matmul operands
 *   scales_a, scales_b        - presumably scale factors for a and b
 *   stride_a/stride_b/stride_c - presumably stride descriptions for a, b, output
 *   layout_sfa, layout_sfb    - presumably scale-factor layouts for a and b
 *   problem_sizes             - presumably per-group problem dimensions
 *   expert_offsets            - presumably per-expert offsets into the inputs
 * Expected shapes, dtypes and devices are not stated in this header --
 * TODO document once verified against the kernel source.
 */
void fp8_blockwise_scaled_grouped_mm(
torch::Tensor& output,
const torch::Tensor& a,
const torch::Tensor& b,
const torch::Tensor& scales_a,
const torch::Tensor& scales_b,
const torch::Tensor& stride_a,
const torch::Tensor& stride_b,
const torch::Tensor& stride_c,
const torch::Tensor& layout_sfa,
const torch::Tensor& layout_sfb,
const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets);
/*
* From csrc/speculative
*/