From 64480df4950a2b120c905917bfedc785ef4c50eb Mon Sep 17 00:00:00 2001
From: yiakwy-xpu-ml-framework-team <89890040+yiakwy-xpu-ml-framework-team@users.noreply.github.com>
Date: Sat, 8 Feb 2025 15:39:44 +0800
Subject: [PATCH] [BUG] fix moe benchmark when bs*seq is small (#3382)

---
 .../fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py b/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py
index e64f57d87..1d9504d0a 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_deepseekv3_moe_align_blocks.py
@@ -157,7 +157,7 @@ def calculate_diff(batch_size, seq_len):
     )
     sorted_ids_cuda.fill_(topk_ids.numel())
     max_num_m_blocks = max_num_tokens_padded // block_size
-    expert_ids_cuda = torch.empty(
+    expert_ids_cuda = torch.zeros(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad_cuda = torch.empty(
@@ -172,7 +172,7 @@ def calculate_diff(batch_size, seq_len):
 
     sorted_ids_triton = torch.empty_like(sorted_ids_cuda)
     sorted_ids_triton.fill_(topk_ids.numel())
-    expert_ids_triton = torch.empty_like(expert_ids_cuda)
+    expert_ids_triton = torch.zeros_like(expert_ids_cuda)
     num_tokens_post_pad_triton = torch.empty_like(num_tokens_post_pad_cuda)
 
     # compare the performance of cuda and triton implementation