diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 478fe9ed2..dfc9d8d7c 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -65,6 +65,7 @@ from sglang.srt.layers.moe import (
     get_deepep_mode,
     get_moe_a2a_backend,
     should_use_flashinfer_cutlass_moe_fp4_allgather,
+    should_use_flashinfer_trtllm_moe,
 )
 from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
 from sglang.srt.layers.moe.fused_moe_triton.layer import (
@@ -375,7 +376,8 @@ class DeepseekV2MoE(nn.Module):
         )
 
         correction_bias = self.gate.e_score_correction_bias
-        if _is_fp4_quantization_enabled():
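+        # Cast the correction bias to bf16 only when FP4 quantization is enabled and the FlashInfer TRT-LLM MoE path is used; see https://github.com/sgl-project/sglang/pull/9834#discussion_r2324480643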
+        if _is_fp4_quantization_enabled() and should_use_flashinfer_trtllm_moe():
             correction_bias = correction_bias.to(torch.bfloat16)
         self.topk = TopK(
             top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
diff --git a/sgl-kernel/csrc/moe/moe_fused_gate.cu b/sgl-kernel/csrc/moe/moe_fused_gate.cu
index 782a884fb..1f70a23d9 100644
--- a/sgl-kernel/csrc/moe/moe_fused_gate.cu
+++ b/sgl-kernel/csrc/moe/moe_fused_gate.cu
@@ -385,6 +385,8 @@ std::vector<at::Tensor> moe_fused_gate(
     int64_t num_fused_shared_experts,
     double routed_scaling_factor,
     bool apply_routed_scaling_factor_on_output) {
+  TORCH_CHECK(input.dtype() == bias.dtype(), "input and bias should have the same dtype");
+
   int64_t num_rows = input.size(0);
   int32_t num_experts = input.size(1);
   auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);