Add a CUDA kernel for fusing mapping and weighted sum for MoE. (#6916)

Co-authored-by: Elfie Guo <elfiegxf@gmail.com>
2025-06-07 15:24:39 -07:00
parent 62fec60d81
commit 3e56f557fd
7 changed files with 146 additions and 12 deletions
--- a/sgl-kernel/python/sgl_kernel/init.py
+++ b/sgl-kernel/python/sgl_kernel/init.py
@@ -48,6 +48,7 @@ from sgl_kernel.gemm import (
 )
 from sgl_kernel.grammar import apply_token_bitmask_inplace_cuda
 from sgl_kernel.moe import (
+    apply_shuffle_mul_sum,
    cutlass_fp4_group_mm,
    ep_moe_post_reorder,
    ep_moe_pre_reorder,