reduce moe_align_block_size_kernel small batch mode overhead (#5086)

commit f730362ee2
parent e3c4bd3153
Author:    Xiaoyu Zhang
Committer: GitHub
Date:      2025-04-10 08:59:35 +08:00

4 changed files with 139 additions and 52 deletions
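
In this fused MoE path, `moe_align_block_size` takes the per-token `topk_ids`, groups the flattened (token, k) assignments by expert, and pads each expert's segment to a multiple of `block_size` so the fused MoE GEMM can consume fixed-size blocks. For small batches, the general multi-block kernel pays launch and synchronization costs that dwarf the actual work, which is the overhead this commit's small-batch mode targets. The sketch below is a minimal PyTorch reference for the alignment semantics only; the name `moe_align_block_size_ref` and the exact output layout are assumptions for illustration, not the kernel's actual API.

import torch


def moe_align_block_size_ref(topk_ids: torch.Tensor, block_size: int, num_experts: int):
    """Reference for the alignment step: sort flattened (token, k) slots by
    expert and pad each expert's run to a multiple of block_size."""
    flat = topk_ids.flatten().to(torch.int64)
    counts = torch.bincount(flat, minlength=num_experts)
    # Round each expert's token count up to a multiple of block_size.
    padded = (counts + block_size - 1) // block_size * block_size
    cumsum = torch.cumsum(padded, dim=0)
    total_padded = int(cumsum[-1])

    # Pad slots hold topk_ids.numel(), an out-of-range sentinel value.
    sorted_token_ids = torch.full((total_padded,), flat.numel(), dtype=torch.int32)
    # One expert id per block of block_size slots.
    expert_ids = torch.repeat_interleave(
        torch.arange(num_experts, dtype=torch.int32), padded // block_size
    )

    fill = cumsum - padded  # start offset of each expert's segment
    for slot in range(flat.numel()):
        e = int(flat[slot])
        sorted_token_ids[int(fill[e])] = slot
        fill[e] += 1
    return sorted_token_ids, expert_ids, total_padded

Filling the padding with `topk_ids.numel()` follows the common convention in these kernels for marking invalid rows, since it can never collide with a real flattened slot index.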

@@ -151,7 +151,6 @@ def moe_align_block_size_triton(
 def test_moe_align_block_size_compare_implementations(
     block_size, num_tokens, topk, num_experts
 ):
-    # For DeepSeek V3, we have 256 experts
     topk_ids = torch.stack(
         [
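
The hunk above is cut off at `torch.stack([`. A plausible shape for the construction it begins (sampling top-k distinct expert ids per token, in the spirit of the removed comment about DeepSeek V3's 256 experts) is sketched below; this is an assumed reconstruction for illustration, not the commit's exact test code.

import torch

# Hypothetical values; the real test parametrizes these via pytest.
num_tokens, topk, num_experts = 16, 8, 256

# One randperm per token guarantees topk distinct expert ids per row.
topk_ids = torch.stack(
    [torch.randperm(num_experts)[:topk] for _ in range(num_tokens)]
).to(torch.int32)
assert topk_ids.shape == (num_tokens, topk)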