From e179e0b79738b0718f2ae4e8653554188badc904 Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Thu, 31 Jul 2025 00:14:39 -0700
Subject: [PATCH] update sgl-kernel for EP: python part (#8550)

---
 python/pyproject.toml                              |  2 +-
 python/sglang/srt/entrypoints/engine.py            |  2 +-
 .../srt/layers/moe/fused_moe_triton/fused_moe.py   | 13 ++++---------
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 1924352d0..ca0c92cbf 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -54,7 +54,7 @@ runtime_common = [

 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.2.7",
+    "sgl-kernel==0.2.8",
     "torch==2.7.1",
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index e52c546a0..8e1fc51d2 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -648,7 +648,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.2.7",
+            "0.2.8",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index 267b594c0..cd027d113 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -568,7 +568,7 @@ def moe_align_block_size(
    - The padding ensures that the total number of tokens is now divisible
        by block_size for proper block matrix operations.
    """
-    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
     sorted_ids = torch.empty(
         (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
     )
@@ -578,13 +578,9 @@ def moe_align_block_size(
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)

+    # In EP, expert_ids for filtered experts are -1. We have num_experts + 1 ids in total.
     cumsum_buffer = torch.empty(
-        (num_experts + 1,), dtype=torch.int32, device=topk_ids.device
-    )
-    token_cnts_buffer = torch.empty(
-        (num_experts + 1) * num_experts,
-        dtype=torch.int32,
-        device=topk_ids.device,
+        (num_experts + 2,), dtype=torch.int32, device=topk_ids.device
     )

     # Threshold based on benchmark results
@@ -594,12 +590,11 @@ def moe_align_block_size(

     sgl_moe_align_block_size(
         topk_ids,
-        num_experts,
+        num_experts + 1,
         block_size,
         sorted_ids,
         expert_ids,
         num_tokens_post_pad,
-        token_cnts_buffer,
         cumsum_buffer,
         fuse_sorted_ids_padding,
     )