Add a CUDA kernel for fusing mapping and weighted sum for MoE. (#6916)
Co-authored-by: Elfie Guo <elfiegxf@gmail.com>
This commit is contained in:
@@ -48,6 +48,7 @@ from sgl_kernel.gemm import (
|
||||
)
|
||||
from sgl_kernel.grammar import apply_token_bitmask_inplace_cuda
|
||||
from sgl_kernel.moe import (
|
||||
apply_shuffle_mul_sum,
|
||||
cutlass_fp4_group_mm,
|
||||
ep_moe_post_reorder,
|
||||
ep_moe_pre_reorder,
|
||||
|
||||
@@ -178,6 +178,17 @@ def prepare_moe_input(
|
||||
)
|
||||
|
||||
|
||||
def apply_shuffle_mul_sum(
    input,
    output,
    permutation,
    factors,
):
    """Fused mapping (shuffle) + weighted sum for MoE, via a custom CUDA op.

    Thin Python wrapper that dispatches straight to the compiled
    ``sgl_kernel.apply_shuffle_mul_sum`` operator (the ``.default``
    overload). Returns ``None``; any result is produced through the
    tensors passed in.

    Args:
        input: source tensor consumed by the kernel.
        output: destination tensor — presumably written in place by the
            kernel (it is passed alongside ``input`` rather than returned).
            NOTE(review): in-place semantics inferred from the name; confirm
            against the CUDA kernel's registration.
        permutation: index mapping applied during the fused shuffle.
            # assumes an integer index tensor — TODO confirm dtype/shape.
        factors: per-element weights multiplied in before the sum.
            # semantics inferred from the op/commit title — verify.
    """
    # Call the .default overload of the registered custom op directly,
    # bypassing any Python-side dispatcher overhead.
    torch.ops.sgl_kernel.apply_shuffle_mul_sum.default(
        input, output, permutation, factors
    )
|
||||
|
||||
|
||||
def cutlass_fp4_group_mm(
|
||||
a_fp4,
|
||||
b_fp4,
|
||||
|
||||
Reference in New Issue
Block a user