Add a CUDA kernel for fusing mapping and weighted sum for MoE. (#6916)

Co-authored-by: Elfie Guo <elfiegxf@gmail.com>
2025-06-07 15:24:39 -07:00
parent 62fec60d81
commit 3e56f557fd
7 changed files with 146 additions and 12 deletions
--- a/sgl-kernel/include/sgl_kernel_ops.h
+++ b/sgl-kernel/include/sgl_kernel_ops.h
@@ -276,6 +276,12 @@ void ep_moe_post_reorder(

 void shuffle_rows(const torch::Tensor& input_tensor, const torch::Tensor& dst2src_map, torch::Tensor& output_tensor);

+void apply_shuffle_mul_sum(
+    const torch::Tensor& input,
+    torch::Tensor& output,
+    const torch::Tensor& permutation,
+    const std::optional<torch::Tensor>& factors);
+
 void cutlass_fp4_group_mm(
    torch::Tensor& output,
    const torch::Tensor& a,