optimize moe_align_kernel cuda (#3347)

2025-02-07 00:53:46 +08:00
parent adeee15204
commit cdae77b03d
3 changed files with 29 additions and 21 deletions
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -417,12 +417,12 @@ def moe_align_block_size(
                num_tokens_post_pad,
            )
        else:
-            token_cnts_buffer = torch.empty(
+            token_cnts_buffer = torch.zeros(
                (num_experts + 1) * num_experts,
                dtype=torch.int32,
                device=topk_ids.device,
            )
-            cumsum_buffer = torch.empty(
+            cumsum_buffer = torch.zeros(
                num_experts + 1, dtype=torch.int32, device=topk_ids.device
            )