diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 478fe9ed2..dfc9d8d7c 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -65,6 +65,7 @@ from sglang.srt.layers.moe import ( get_deepep_mode, get_moe_a2a_backend, should_use_flashinfer_cutlass_moe_fp4_allgather, + should_use_flashinfer_trtllm_moe, ) from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class from sglang.srt.layers.moe.fused_moe_triton.layer import ( @@ -375,7 +376,8 @@ class DeepseekV2MoE(nn.Module): ) correction_bias = self.gate.e_score_correction_bias - if _is_fp4_quantization_enabled(): + # https://github.com/sgl-project/sglang/pull/9834#discussion_r2324480643 + if _is_fp4_quantization_enabled() and should_use_flashinfer_trtllm_moe(): correction_bias = correction_bias.to(torch.bfloat16) self.topk = TopK( top_k=config.num_experts_per_tok + self.num_fused_shared_experts, diff --git a/sgl-kernel/csrc/moe/moe_fused_gate.cu b/sgl-kernel/csrc/moe/moe_fused_gate.cu index 782a884fb..1f70a23d9 100644 --- a/sgl-kernel/csrc/moe/moe_fused_gate.cu +++ b/sgl-kernel/csrc/moe/moe_fused_gate.cu @@ -385,6 +385,8 @@ std::vector moe_fused_gate( int64_t num_fused_shared_experts, double routed_scaling_factor, bool apply_routed_scaling_factor_on_output) { + TORCH_CHECK(input.dtype() == bias.dtype(), "input and bias should have the same dtype"); + int64_t num_rows = input.size(0); int32_t num_experts = input.size(1); auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);