[EP] Add CUDA kernel for moe_ep_pre_reorder (#6699)

Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
This commit is contained in:
Yuan Luo
2025-06-02 11:49:01 +08:00
committed by GitHub
parent 20fd53b8f6
commit 55444ed667
7 changed files with 230 additions and 0 deletions

View File

@@ -150,6 +150,10 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
"n_share_experts_fusion, float routed_scaling_factor) -> "
"(Tensor[])");
m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
// Declare the operator schema for ep_moe_pre_reorder: five Tensor arguments
// (input_ptr, gateup_input_ptr, src2dst_ptr, topk_ids_ptr, a1_scales_ptr),
// three ints (start_expert_id, end_expert_id, topk), one bool
// (use_per_token_if_dynamic), and an empty return list.
// NOTE(review): no Tensor argument carries a mutation annotation (e.g. `Tensor!`),
// yet the op returns nothing, so it presumably writes into one of its arguments
// (gateup_input_ptr looks like the output) — confirm against the kernel and
// annotate the schema if so.
m.def(
"ep_moe_pre_reorder(Tensor input_ptr, Tensor gateup_input_ptr, Tensor src2dst_ptr, Tensor topk_ids_ptr, Tensor "
"a1_scales_ptr, int start_expert_id, int end_expert_id, int topk, bool use_per_token_if_dynamic) -> ()");
// Register the ep_moe_pre_reorder function as this operator's implementation for the CUDA dispatch key.
m.impl("ep_moe_pre_reorder", torch::kCUDA, &ep_moe_pre_reorder);
m.def(
"fp8_blockwise_scaled_grouped_mm(Tensor output, Tensor a_ptrs, Tensor b_ptrs, Tensor out_ptrs, Tensor "
"a_scales_ptrs, Tensor b_scales_ptrs, Tensor a, Tensor b, Tensor scales_a, Tensor scales_b, Tensor "