Revert "[Fix] DeepSeek EP accuracy issue on B200 GPUs (#9946)" (#9955)

2025-09-02 23:49:56 -07:00
parent df397a72e8
commit 2c7ca33abb
3 changed files with 1 addition and 31 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -268,7 +268,6 @@ class ServerArgs:
    flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
    enable_flashinfer_allreduce_fusion: bool = False
    deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
-    disable_deepgemm_ue8m0: bool = False
    ep_num_redundant_experts: int = 0
    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
    init_expert_location: str = "trivial"
@@ -1563,11 +1562,6 @@ class ServerArgs:
            default="auto",
            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
        )
-        parser.add_argument(
-            "--disable-deepgemm-ue8m0",
-            action="store_true",
-            help="Disable DeepGEMM UE8M0 scaling optimizations. This can help with accuracy issues on Blackwell GPUs (B200) for certain models like DeepSeek.",
-        )
        parser.add_argument(
            "--ep-num-redundant-experts",
            type=int,