[sgl-kernel] Support moe_sum_reduce cuda kernel (#10321)
Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com>
This commit is contained in:
@@ -112,6 +112,7 @@ from sgl_kernel.moe import (
|
||||
fp8_blockwise_scaled_grouped_mm,
|
||||
moe_align_block_size,
|
||||
moe_fused_gate,
|
||||
moe_sum_reduce,
|
||||
prepare_moe_input,
|
||||
topk_softmax,
|
||||
)
|
||||
|
||||
@@ -36,6 +36,18 @@ def topk_softmax(
|
||||
)
|
||||
|
||||
|
||||
def moe_sum_reduce(
    input_tensor: "torch.Tensor",
    output_tensor: "torch.Tensor",
    routed_scaling_factor: float = 0,
) -> None:
    """Sum-reduce MoE expert outputs into ``output_tensor`` via the sgl_kernel CUDA op.

    Thin Python wrapper that dispatches directly to the compiled
    ``torch.ops.sgl_kernel.moe_sum_reduce`` kernel; all computation happens
    in the extension, and the result is written into ``output_tensor``
    in place (this function returns nothing).

    Args:
        input_tensor: Per-expert activations to be reduced.
            # assumes shape (num_tokens, topk, hidden_dim) reduced over the
            # topk axis — TODO confirm against the CUDA kernel signature.
        output_tensor: Pre-allocated destination tensor the kernel writes into.
        routed_scaling_factor: Scaling factor forwarded to the kernel.
            # NOTE(review): the default of 0 looks suspicious if the kernel
            # multiplies the reduced sum by this factor (it would zero the
            # output) — confirm the kernel's semantics for 0.
    """
    torch.ops.sgl_kernel.moe_sum_reduce.default(
        input_tensor,
        output_tensor,
        routed_scaling_factor,
    )
|
||||
|
||||
|
||||
def moe_fused_gate(
|
||||
input_tensor,
|
||||
bias,
|
||||
|
||||
Reference in New Issue
Block a user