From dd949ace23d6d112d9e3e1e2020deb3278cc8b3c Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Sun, 10 Aug 2025 17:34:54 -0700 Subject: [PATCH] =?UTF-8?q?Revert=20"[1/2][resubmit]=20sgl-kernel:=20Fuse?= =?UTF-8?q?=20routed=20scaling=20factor=20into=20m=E2=80=A6=20(#9035)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/sglang/srt/layers/moe/topk.py | 24 ---------------------- sgl-kernel/csrc/common_extension.cc | 2 +- sgl-kernel/csrc/moe/moe_fused_gate.cu | 27 +++++++------------------ sgl-kernel/include/sgl_kernel_ops.h | 3 +-- sgl-kernel/python/sgl_kernel/moe.py | 11 ++-------- sgl-kernel/tests/test_moe_fused_gate.py | 7 +------ 6 files changed, 12 insertions(+), 62 deletions(-) diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py index 5313a0eff..8830cd272 100644 --- a/python/sglang/srt/layers/moe/topk.py +++ b/python/sglang/srt/layers/moe/topk.py @@ -132,7 +132,6 @@ class TopK(CustomOp): scoring_func: str = "softmax", correction_bias: Optional[torch.Tensor] = None, routed_scaling_factor: Optional[float] = None, - apply_routed_scaling_factor_on_output: Optional[bool] = False, ): # NOTE: scoring_func is not used for now, but we keep it for future use # see https://github.com/sgl-project/sglang/pull/4505 for more details @@ -148,9 +147,6 @@ class TopK(CustomOp): self.custom_routing_function = custom_routing_function self.correction_bias = correction_bias self.routed_scaling_factor = routed_scaling_factor - self.apply_routed_scaling_factor_on_output = ( - apply_routed_scaling_factor_on_output - ) self.use_triton_kernels = global_server_args_dict["enable_triton_kernel_moe"] @@ -211,7 +207,6 @@ class TopK(CustomOp): routed_scaling_factor=self.routed_scaling_factor, num_token_non_padded=num_token_non_padded, expert_location_dispatch_info=expert_location_dispatch_info, - apply_routed_scaling_factor_on_output=self.apply_routed_scaling_factor_on_output, ) def forward_cpu( @@ -381,7 +376,6 @@ def grouped_topk_gpu( routed_scaling_factor: Optional[float] = None, num_token_non_padded: Optional[torch.Tensor] = None, expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None, - apply_routed_scaling_factor_on_output: Optional[bool] = False, ): assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" @@ -429,8 +423,6 @@ def grouped_topk_gpu( else topk_weights[:, :-1].sum(dim=-1, keepdim=True) ) topk_weights = topk_weights / topk_weights_sum - if apply_routed_scaling_factor_on_output: - topk_weights *= routed_scaling_factor topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32) topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info) @@ -479,7 +471,6 @@ def biased_grouped_topk_impl( routed_scaling_factor: Optional[float] = None, num_token_non_padded: Optional[torch.Tensor] = None, expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None, - apply_routed_scaling_factor_on_output: Optional[bool] = False, ): assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" @@ -531,8 +522,6 @@ def biased_grouped_topk_impl( else topk_weights[:, :-1].sum(dim=-1, keepdim=True) ) topk_weights = topk_weights / topk_weights_sum - if apply_routed_scaling_factor_on_output: - topk_weights *= routed_scaling_factor topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32) topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info) @@ -575,10 +564,7 @@ def biased_grouped_topk_gpu( routed_scaling_factor: Optional[float] = None, num_token_non_padded: Optional[torch.Tensor] = None, expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None, - apply_routed_scaling_factor_on_output: Optional[bool] = False, ): - # TODO(trevor-m): Remove once sgl-kernel is updated - assert not apply_routed_scaling_factor_on_output assert ( routed_scaling_factor is not None ), "routed_scaling_factor is required for biased_grouped_topk" @@ -597,8 +583,6 @@ def biased_grouped_topk_gpu( topk, num_fused_shared_experts, routed_scaling_factor, - # TODO(trevor-m): Uncomment once sgl-kernel is updated - # apply_routed_scaling_factor_on_output, ) # TODO merge into kernel if (expert_location_dispatch_info is not None) or ( @@ -609,7 +593,6 @@ def biased_grouped_topk_gpu( ) return topk_weights, topk_ids elif _use_aiter: - assert not apply_routed_scaling_factor_on_output, "Not implemented" token = gating_output.shape[0] device = gating_output.device assert ( @@ -641,7 +624,6 @@ def biased_grouped_topk_gpu( routed_scaling_factor=routed_scaling_factor, num_token_non_padded=num_token_non_padded, expert_location_dispatch_info=expert_location_dispatch_info, - apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output, ) @@ -701,7 +683,6 @@ def select_experts( routed_scaling_factor: Optional[float] = None, num_token_non_padded: Optional[torch.Tensor] = None, expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None, - apply_routed_scaling_factor_on_output: Optional[bool] = False, ) -> TopKOutput: router_logits, correction_bias = ( expert_location_dispatch.transform_select_experts_inputs( @@ -727,7 +708,6 @@ def select_experts( routed_scaling_factor=routed_scaling_factor, num_token_non_padded=num_token_non_padded, expert_location_dispatch_info=expert_location_dispatch_info, - apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output, ) else: topk_weights, topk_ids = biased_grouped_topk( @@ -742,14 +722,12 @@ def select_experts( routed_scaling_factor=routed_scaling_factor, num_token_non_padded=num_token_non_padded, expert_location_dispatch_info=expert_location_dispatch_info, - apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output, ) elif torch_native and custom_routing_function is None: assert ( num_token_non_padded is None ), "num_token_non_padded is not yet supported in fused_topk_native" assert expert_location_dispatch_info is None - assert not apply_routed_scaling_factor_on_output, "Not implemented" topk_weights, topk_ids = fused_topk_native( hidden_states=hidden_states, gating_output=router_logits, @@ -757,7 +735,6 @@ def select_experts( renormalize=renormalize, ) elif custom_routing_function is None: - assert not apply_routed_scaling_factor_on_output, "Not implemented" # Qwen3MOE uses fused_topk topk_weights, topk_ids = fused_topk( hidden_states=hidden_states, @@ -772,7 +749,6 @@ def select_experts( num_token_non_padded is None ), "num_token_non_padded is not yet supported in custom_routing_function" assert expert_location_dispatch_info is None - assert not apply_routed_scaling_factor_on_output, "Not implemented" topk_weights, topk_ids = custom_routing_function( hidden_states=hidden_states, gating_output=router_logits, diff --git a/sgl-kernel/csrc/common_extension.cc b/sgl-kernel/csrc/common_extension.cc index 2c3b9b767..fde329699 100644 --- a/sgl-kernel/csrc/common_extension.cc +++ b/sgl-kernel/csrc/common_extension.cc @@ -174,7 +174,7 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) { m.def( "moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int " - "num_fused_shared_experts, float routed_scaling_factor, bool apply_routed_scaling_factor_on_output) -> " + "num_fused_shared_experts, float routed_scaling_factor) -> " "(Tensor[])"); m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate); m.def( diff --git a/sgl-kernel/csrc/moe/moe_fused_gate.cu b/sgl-kernel/csrc/moe/moe_fused_gate.cu index 782a884fb..24bf2d36b 100644 --- a/sgl-kernel/csrc/moe/moe_fused_gate.cu +++ b/sgl-kernel/csrc/moe/moe_fused_gate.cu @@ -59,7 +59,6 @@ __device__ void moe_fused_gate_impl( int64_t topk, int64_t num_fused_shared_experts, double routed_scaling_factor, - bool apply_routed_scaling_factor_on_output, Params params) { int tidx = threadIdx.x; int64_t thread_row = @@ -249,9 +248,6 @@ __device__ void moe_fused_gate_impl( for (int ii = 0; ii < topk; ++ii) { int64_t const idx = topk * thread_row + ii; output_ptr[idx] = output_ptr[idx] / output_sum; - if (apply_routed_scaling_factor_on_output) { - output_ptr[idx] *= routed_scaling_factor; - } } } } @@ -286,8 +282,7 @@ __global__ void moe_fused_gate_kernel( int64_t topk_group, int64_t topk, int64_t num_fused_shared_experts, - double routed_scaling_factor, - bool apply_routed_scaling_factor_on_output) { + double routed_scaling_factor) { KernelParams params; moe_fused_gate_impl( input, @@ -299,7 +294,6 @@ __global__ void moe_fused_gate_kernel( topk, num_fused_shared_experts, routed_scaling_factor, - apply_routed_scaling_factor_on_output, params); } @@ -320,8 +314,7 @@ __global__ void moe_fused_gate_kernel( topk_group, \ topk, \ num_fused_shared_experts, \ - routed_scaling_factor, \ - apply_routed_scaling_factor_on_output); \ + routed_scaling_factor); \ dispatched = true; \ } while (0) @@ -349,8 +342,7 @@ __global__ void moe_fused_gate_kernel_dynamic( int64_t topk_group, int64_t topk, int64_t num_fused_shared_experts, - double routed_scaling_factor, - bool apply_routed_scaling_factor_on_output) { + double routed_scaling_factor) { KernelParamsDynamic params; params.NUM_EXPERTS = num_experts; // e.g, for deepseek v3, this is 256 params.VPT = num_experts / num_expert_group; // e.g., for deepseek v3, this is 256 / 8 = 32 @@ -369,7 +361,6 @@ __global__ void moe_fused_gate_kernel_dynamic( topk, num_fused_shared_experts, routed_scaling_factor, - apply_routed_scaling_factor_on_output, params); } @@ -383,8 +374,7 @@ std::vector moe_fused_gate( int64_t topk_group, int64_t topk, int64_t num_fused_shared_experts, - double routed_scaling_factor, - bool apply_routed_scaling_factor_on_output) { + double routed_scaling_factor) { int64_t num_rows = input.size(0); int32_t num_experts = input.size(1); auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); @@ -483,8 +473,7 @@ std::vector moe_fused_gate( topk_group, topk, num_fused_shared_experts, - routed_scaling_factor, - apply_routed_scaling_factor_on_output); + routed_scaling_factor); } else if (input.scalar_type() == at::kHalf) { moe_fused_gate_kernel_dynamic<<>>( input.data_ptr(), @@ -497,8 +486,7 @@ std::vector moe_fused_gate( topk_group, topk, num_fused_shared_experts, - routed_scaling_factor, - apply_routed_scaling_factor_on_output); + routed_scaling_factor); } else if (input.scalar_type() == at::kFloat) { moe_fused_gate_kernel_dynamic<<>>( input.data_ptr(), @@ -511,8 +499,7 @@ std::vector moe_fused_gate( topk_group, topk, num_fused_shared_experts, - routed_scaling_factor, - apply_routed_scaling_factor_on_output); + routed_scaling_factor); } else { TORCH_CHECK(false, "Unsupported data type for moe_fused_gate"); } diff --git a/sgl-kernel/include/sgl_kernel_ops.h b/sgl-kernel/include/sgl_kernel_ops.h index 15b3e2db7..9e71cd8c8 100644 --- a/sgl-kernel/include/sgl_kernel_ops.h +++ b/sgl-kernel/include/sgl_kernel_ops.h @@ -243,8 +243,7 @@ std::vector moe_fused_gate( int64_t topk_group, int64_t topk, int64_t num_fused_shared_experts, - double routed_scaling_factor, - bool apply_routed_scaling_factor_on_output); + double routed_scaling_factor); void fp8_blockwise_scaled_grouped_mm( torch::Tensor& output, diff --git a/sgl-kernel/python/sgl_kernel/moe.py b/sgl-kernel/python/sgl_kernel/moe.py index 9008e7a79..c16a2b6fe 100755 --- a/sgl-kernel/python/sgl_kernel/moe.py +++ b/sgl-kernel/python/sgl_kernel/moe.py @@ -44,7 +44,6 @@ def moe_fused_gate( topk, num_fused_shared_experts=0, routed_scaling_factor=0, - apply_routed_scaling_factor_on_output=False, ): # This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion # it split group of expert into num_expert_group, and use top2 expert weight sum in each group @@ -52,13 +51,8 @@ def moe_fused_gate( # the #experts is decided by the input tensor shape and we currently only support power of 2 #experts # and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limited for now. # for non-supported case, we suggest to use the biased_grouped_topk func in sglang.srt.layers.moe.topk - # num_fused_shared_experts: if > 0, the last several experts will be - # replaced with shared experts. the shared experts will be divided by the - # routed_scaling_factor - this is intended to cancel out later when routed+shared - # output is scaled so that shared experts are not scaled. - # routed_scaling_factor: if > 0, the experts will be scaled by this factor - # apply_routed_scaling_factor_on_output: if true, output will be - # scaled by the routed_scaling_factor + # num_fused_shared_experts: if > 0, the last several experts will be replaced with shared experts + # routed_scaling_factor: if > 0, the shared experts will be scaled by this factor return torch.ops.sgl_kernel.moe_fused_gate.default( input_tensor, bias, @@ -67,7 +61,6 @@ def moe_fused_gate( topk, num_fused_shared_experts, routed_scaling_factor, - apply_routed_scaling_factor_on_output, ) diff --git a/sgl-kernel/tests/test_moe_fused_gate.py b/sgl-kernel/tests/test_moe_fused_gate.py index 274f387a9..70c4ea209 100644 --- a/sgl-kernel/tests/test_moe_fused_gate.py +++ b/sgl-kernel/tests/test_moe_fused_gate.py @@ -19,10 +19,7 @@ from sglang.srt.layers.moe.topk import biased_grouped_topk ], ) @pytest.mark.parametrize("num_fused_shared_experts", [0, 1, 2]) -@pytest.mark.parametrize("apply_routed_scaling_factor_on_output", [True, False]) -def test_moe_fused_gate_combined( - seq_length, params, num_fused_shared_experts, apply_routed_scaling_factor_on_output -): +def test_moe_fused_gate_combined(seq_length, params, num_fused_shared_experts): num_experts, num_expert_group, topk_group, topk = params dtype = torch.float32 @@ -40,7 +37,6 @@ def test_moe_fused_gate_combined( topk=topk, num_fused_shared_experts=num_fused_shared_experts, routed_scaling_factor=2.5, - apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output, ) ref_output, ref_indices = biased_grouped_topk( scores, @@ -52,7 +48,6 @@ def test_moe_fused_gate_combined( topk_group=topk_group, num_fused_shared_experts=num_fused_shared_experts, routed_scaling_factor=2.5, - apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output, ) # When num_fused_shared_experts > 0, ignore the comparison of the last topk dimension