[sgl-kernel] Support moe_sum_reduce cuda kernel (#10321)

Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com>
2025-09-19 14:12:09 +08:00
parent ac2a723bb3
commit 616a3e20df
7 changed files with 346 additions and 10 deletions
--- a/sgl-kernel/include/sgl_kernel_ops.h
+++ b/sgl-kernel/include/sgl_kernel_ops.h
@@ -293,6 +293,8 @@ void moe_align_block_size(
 void topk_softmax(
    torch::Tensor& topk_weights, torch::Tensor& topk_indices, torch::Tensor& gating_output, bool renormalize);

+void moe_sum_reduce(at::Tensor& input, at::Tensor& output, double routed_scaling_factor);
+
 std::vector<at::Tensor> moe_fused_gate(
    at::Tensor& input,
    at::Tensor& bias,