[1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (#5281)
This commit is contained in:
14
sgl-kernel/include/sgl_kernel_ops.h
Normal file → Executable file
14
sgl-kernel/include/sgl_kernel_ops.h
Normal file → Executable file
@@ -209,6 +209,20 @@ std::vector<at::Tensor> moe_fused_gate(
|
||||
int64_t n_share_experts_fusion,
|
||||
double routed_scaling_factor);
|
||||
|
||||
/*
 * Declaration of fp8_blockwise_scaled_grouped_mm; the definition is not
 * part of this header hunk.
 *
 * Signature facts visible here:
 *   - returns void;
 *   - `output` is the only parameter taken by non-const reference, so it is
 *     presumably where the result is written -- TODO confirm against the
 *     implementation;
 *   - every other parameter is a const torch::Tensor reference.
 *
 * NOTE(review): the parameter meanings below are inferred from names only
 * and must be verified against the kernel source before being relied on:
 *   - a, b                     : presumably the two matrix operands
 *   - scales_a, scales_b       : presumably scale factors for a and b
 *   - stride_a/stride_b/stride_c : presumably stride descriptors for the
 *                                operands and the result
 *   - layout_sfa, layout_sfb   : presumably layout descriptors for the scale
 *                                factors of a and b
 *   - problem_sizes            : presumably the per-group problem dimensions
 *   - expert_offsets           : presumably per-expert offsets into the inputs
 * Expected shapes, dtypes and devices are not stated in this header.
 */
void fp8_blockwise_scaled_grouped_mm(
|
||||
torch::Tensor& output,
|
||||
const torch::Tensor& a,
|
||||
const torch::Tensor& b,
|
||||
const torch::Tensor& scales_a,
|
||||
const torch::Tensor& scales_b,
|
||||
const torch::Tensor& stride_a,
|
||||
const torch::Tensor& stride_b,
|
||||
const torch::Tensor& stride_c,
|
||||
const torch::Tensor& layout_sfa,
|
||||
const torch::Tensor& layout_sfb,
|
||||
const torch::Tensor& problem_sizes,
|
||||
const torch::Tensor& expert_offsets);
|
||||
|
||||
/*
|
||||
* From csrc/speculative
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user