reduce moe_align_block_size_kernel small batch mode overhead (#5086)

commit f730362ee2
parent e3c4bd3153
Author:    Xiaoyu Zhang
Committer: GitHub
Date:      2025-04-10 08:59:35 +08:00

4 changed files with 139 additions and 52 deletions
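
In this fused MoE path, `moe_align_block_size` takes the per-token `topk_ids`, groups the flattened (token, k) assignments by expert, and pads each expert's segment to a multiple of `block_size` so the fused MoE GEMM can consume fixed-size blocks. For small batches, the general multi-block kernel pays launch and synchronization costs that dwarf the actual work, which is the overhead this commit's small-batch mode targets. The sketch below is a minimal PyTorch reference for the alignment semantics only; the name `moe_align_block_size_ref` and the exact output layout are assumptions for illustration, not the kernel's actual API.

import torch


def moe_align_block_size_ref(topk_ids: torch.Tensor, block_size: int, num_experts: int):
    """Reference for the alignment step: sort flattened (token, k) slots by
    expert and pad each expert's run to a multiple of block_size."""
    flat = topk_ids.flatten().to(torch.int64)
    counts = torch.bincount(flat, minlength=num_experts)
    # Round each expert's token count up to a multiple of block_size.
    padded = (counts + block_size - 1) // block_size * block_size
    cumsum = torch.cumsum(padded, dim=0)
    total_padded = int(cumsum[-1])

    # Pad slots hold topk_ids.numel(), an out-of-range sentinel value.
    sorted_token_ids = torch.full((total_padded,), flat.numel(), dtype=torch.int32)
    # One expert id per block of block_size slots.
    expert_ids = torch.repeat_interleave(
        torch.arange(num_experts, dtype=torch.int32), padded // block_size
    )

    fill = cumsum - padded  # start offset of each expert's segment
    for slot in range(flat.numel()):
        e = int(flat[slot])
        sorted_token_ids[int(fill[e])] = slot
        fill[e] += 1
    return sorted_token_ids, expert_ids, total_padded

Filling the padding with `topk_ids.numel()` follows the common convention in these kernels for marking invalid rows, since it can never collide with a real flattened slot index.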

@@ -151,7 +151,6 @@ def moe_align_block_size_triton(
 def test_moe_align_block_size_compare_implementations(
     block_size, num_tokens, topk, num_experts
 ):
-    # For DeepSeek V3, we have 256 experts
     topk_ids = torch.stack(
         [
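
The hunk above is cut off at `torch.stack([`. A plausible shape for the construction it begins (sampling top-k distinct expert ids per token, in the spirit of the removed comment about DeepSeek V3's 256 experts) is sketched below; this is an assumed reconstruction for illustration, not the commit's exact test code.

import torch

# Hypothetical values; the real test parametrizes these via pytest.
num_tokens, topk, num_experts = 16, 8, 256

# One randperm per token guarantees topk distinct expert ids per row.
topk_ids = torch.stack(
    [torch.randperm(num_experts)[:topk] for _ in range(num_tokens)]
).to(torch.int32)
assert topk_ids.shape == (num_tokens, topk)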