reduce moe_align_block_size_kernel small batch mode overhead (#5086)

This commit is contained in:
Xiaoyu Zhang
2025-04-10 08:59:35 +08:00
committed by GitHub
parent e3c4bd3153
commit f730362ee2
4 changed files with 139 additions and 52 deletions

View File

@@ -702,7 +702,7 @@ def moe_align_block_size(
num_tokens_post_pad,
)
else:
-        token_cnts_buffer = torch.zeros(
+        token_cnts_buffer = torch.empty(
(num_experts + 1) * num_experts,
dtype=torch.int32,
device=topk_ids.device,