Add a CUDA kernel for fusing mapping and weighted sum for MoE. (#6916)
Co-authored-by: Elfie Guo <elfiegxf@gmail.com>
This commit is contained in:
@@ -48,6 +48,7 @@ from sgl_kernel.gemm import (
|
||||
)
|
||||
from sgl_kernel.grammar import apply_token_bitmask_inplace_cuda
|
||||
from sgl_kernel.moe import (
|
||||
apply_shuffle_mul_sum,
|
||||
cutlass_fp4_group_mm,
|
||||
ep_moe_post_reorder,
|
||||
ep_moe_pre_reorder,
|
||||
|
||||
@@ -178,6 +178,17 @@ def prepare_moe_input(
|
||||
)
|
||||
|
||||
|
||||
def apply_shuffle_mul_sum(
    input,
    output,
    permutation,
    factors,
):
    """Fused mapping (shuffle) + weighted sum for MoE, via a custom CUDA op.

    Thin Python wrapper that dispatches straight to the compiled
    ``sgl_kernel.apply_shuffle_mul_sum`` operator (the ``.default``
    overload). Returns ``None``; any result is produced through the
    tensors passed in.

    Args:
        input: source tensor consumed by the kernel.
        output: destination tensor — presumably written in place by the
            kernel (it is passed alongside ``input`` rather than returned).
            NOTE(review): in-place semantics inferred from the name; confirm
            against the CUDA kernel's registration.
        permutation: index mapping applied during the fused shuffle.
            # assumes an integer index tensor — TODO confirm dtype/shape.
        factors: per-element weights multiplied in before the sum.
            # semantics inferred from the op/commit title — verify.
    """
    # Call the .default overload of the registered custom op directly,
    # bypassing any Python-side dispatcher overhead.
    torch.ops.sgl_kernel.apply_shuffle_mul_sum.default(
        input, output, permutation, factors
    )
|
||||
|
||||
|
||||
def cutlass_fp4_group_mm(
|
||||
a_fp4,
|
||||
b_fp4,
|
||||
|
||||
Reference in New Issue
Block a user