fix moe_align_block_size_kernel for shared memory issue (#2579)

Co-authored-by: ispobock <ispobaoke@163.com>
2024-12-26 05:31:04 +08:00
parent 53aed988cb
commit 31548116a8
6 changed files with 225 additions and 2 deletions
--- a/sgl-kernel/tests/test_moe_align.py
+++ b/sgl-kernel/tests/test_moe_align.py
@@ -0,0 +1,26 @@
+import torch
+from sgl_kernel import moe_align_block_size
+
+
+def test_moe_align_block_size():
+    num_experts = 256
+    block_size = 128
+    topk_ids = torch.randint(0, num_experts, (3, 4), dtype=torch.int32, device="cuda")
+
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
+    sorted_ids.fill_(topk_ids.numel())
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids = torch.empty(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+
+    moe_align_block_size(
+        topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad
+    )
+
+
+test_moe_align_block_size()