Revert "[MOE] enable efficient moe_alignment multi-blocks execution (3x~6x)" (#3982)

2025-02-28 23:57:17 -08:00
parent 6b859e7ddd
commit 18bb216c28
5 changed files with 94 additions and 381 deletions
--- a/sgl-kernel/tests/test_moe_align.py
+++ b/sgl-kernel/tests/test_moe_align.py
@@ -171,12 +171,12 @@ def test_moe_align_block_size_compare_implementations(block_size, num_tokens, to
    num_tokens_post_pad_cuda = torch.empty(
        (1), dtype=torch.int32, device=topk_ids.device
    )
-    token_cnts_buffer = torch.zeros(
+    token_cnts_buffer = torch.empty(
        (num_experts + 1) * num_experts,
        dtype=torch.int32,
        device=topk_ids.device,
    )
-    cumsum_buffer = torch.zeros(
+    cumsum_buffer = torch.empty(
        num_experts + 1, dtype=torch.int32, device=topk_ids.device
    )