Add deepseek style fused moe group gate selection kernel (#4530)

This commit is contained in:
Qingquan Song
2025-03-29 11:51:45 -07:00
committed by GitHub
parent ddf8981d91
commit 45dcfc2e76
9 changed files with 616 additions and 1 deletions

View File

@@ -138,6 +138,11 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
"token_expert_indices, Tensor gating_output) -> ()");
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
// Declare the operator schema for `moe_fused_gate`: it takes two tensors
// (`input`, `bias`) and three integers (`num_expert_group`, `topk_group`,
// `topk`), and returns a list of tensors (`Tensor[]`).
// NOTE(review): presumably the DeepSeek-style grouped top-k expert gate;
// tensor shapes/dtypes and the order of the returned tensors are not visible
// here — confirm against the kernel implementation.
m.def(
"moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk) -> "
"(Tensor[])");
// Bind the CUDA dispatch key of the op to the `moe_fused_gate` function.
m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
/*
* From csrc/speculative
*/