update sgl-kernel for EP: kernel part (#8514)

Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com>
Co-authored-by: Ke Bao <ispobaoke@gmail.com>
2025-07-30 22:19:55 -07:00
parent 59aab76f0a
commit a5f5ab4030
7 changed files with 12 additions and 32 deletions
--- a/sgl-kernel/benchmark/bench_moe_align_block_size.py
+++ b/sgl-kernel/benchmark/bench_moe_align_block_size.py
@@ -164,9 +164,6 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
    num_tokens_post_pad_cuda = torch.empty(
        (1), dtype=torch.int32, device=topk_ids.device
    )
-    token_cnts_buffer = torch.zeros(
-        (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
-    )
    cumsum_buffer = torch.zeros(
        num_experts + 1, dtype=torch.int32, device=topk_ids.device
    )
@@ -189,7 +186,6 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
        sorted_ids_cuda,
        expert_ids_cuda,
        num_tokens_post_pad_cuda,
-        token_cnts_buffer,
        cumsum_buffer,
    )
    moe_align_block_size_triton(
@@ -273,11 +269,6 @@ def sgl_moe_align_block_size_with_empty(
    if not pad_sorted_token_ids:
        sorted_ids.fill_(topk_ids.numel())

-    token_cnts_buffer = torch.empty(
-        (num_experts + 1) * num_experts,
-        dtype=torch.int32,
-        device=topk_ids.device,
-    )
    cumsum_buffer = torch.empty(
        num_experts + 1, dtype=torch.int32, device=topk_ids.device
    )
@@ -289,7 +280,6 @@ def sgl_moe_align_block_size_with_empty(
        sorted_ids.clone(),
        expert_ids.clone(),
        num_tokens_post_pad.clone(),
-        token_cnts_buffer,
        cumsum_buffer,
        pad_sorted_token_ids,
    )