Add a CUDA kernel for fusing mapping and weighted sum for MoE. (#6916)

Co-authored-by: Elfie Guo <elfiegxf@gmail.com>
This commit is contained in:
Elfie Guo
2025-06-07 15:24:39 -07:00
committed by GitHub
parent 62fec60d81
commit 3e56f557fd
7 changed files with 146 additions and 12 deletions

View File

@@ -48,6 +48,7 @@ from sgl_kernel.gemm import (
)
from sgl_kernel.grammar import apply_token_bitmask_inplace_cuda
from sgl_kernel.moe import (
apply_shuffle_mul_sum,
cutlass_fp4_group_mm,
ep_moe_post_reorder,
ep_moe_pre_reorder,

View File

@@ -178,6 +178,17 @@ def prepare_moe_input(
)
def apply_shuffle_mul_sum(
    input,
    output,
    permutation,
    factors,
):
    """Dispatch to the ``sgl_kernel`` ``apply_shuffle_mul_sum`` custom op.

    The four arguments are forwarded positionally, unchanged and in the
    same order, to ``torch.ops.sgl_kernel.apply_shuffle_mul_sum.default``.

    This wrapper returns ``None``; whatever the op itself returns is
    discarded.

    NOTE(review): the expected shapes/dtypes of the arguments and the exact
    semantics are defined by the kernel, which is not visible from here.
    Presumably ``input`` rows are gathered via ``permutation``, scaled by
    ``factors`` and accumulated into ``output`` in place -- confirm against
    the kernel implementation before relying on this.
    """
    # Resolve the registered overload first, then invoke it; the call's
    # result is intentionally not propagated to the caller.
    fused_op = torch.ops.sgl_kernel.apply_shuffle_mul_sum.default
    fused_op(input, output, permutation, factors)
def cutlass_fp4_group_mm(
a_fp4,
b_fp4,