Update sgl-kernel for EP (kernel part): drop the token_cnts_buffer allocation and argument (#8514)

Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com>
Co-authored-by: Ke Bao <ispobaoke@gmail.com>
This commit is contained in:
Cheng Wan
2025-07-30 22:19:55 -07:00
committed by GitHub
parent 59aab76f0a
commit a5f5ab4030
7 changed files with 12 additions and 32 deletions

View File

@@ -164,9 +164,6 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
num_tokens_post_pad_cuda = torch.empty(
(1), dtype=torch.int32, device=topk_ids.device
)
token_cnts_buffer = torch.zeros(
(num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
)
cumsum_buffer = torch.zeros(
num_experts + 1, dtype=torch.int32, device=topk_ids.device
)
@@ -189,7 +186,6 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
sorted_ids_cuda,
expert_ids_cuda,
num_tokens_post_pad_cuda,
token_cnts_buffer,
cumsum_buffer,
)
moe_align_block_size_triton(
@@ -273,11 +269,6 @@ def sgl_moe_align_block_size_with_empty(
if not pad_sorted_token_ids:
sorted_ids.fill_(topk_ids.numel())
token_cnts_buffer = torch.empty(
(num_experts + 1) * num_experts,
dtype=torch.int32,
device=topk_ids.device,
)
cumsum_buffer = torch.empty(
num_experts + 1, dtype=torch.int32, device=topk_ids.device
)
@@ -289,7 +280,6 @@ def sgl_moe_align_block_size_with_empty(
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
token_cnts_buffer,
cumsum_buffer,
pad_sorted_token_ids,
)