Revert "[Fix] DeepSeek EP accuracy issue on B200 GPUs (#9946)" (#9955)

2025-09-02 23:49:56 -07:00
parent df397a72e8
commit 2c7ca33abb
3 changed files with 1 addition and 31 deletions
--- a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py
+++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py
@@ -44,13 +44,6 @@ def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
    global _DO_COMPILE_ALL
    global _IS_FIRST_RANK_ON_NODE

-    # Update UE8M0 scaling configuration based on server args
-    from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
-        update_deepgemm_scale_ue8m0,
-    )
-
-    update_deepgemm_scale_ue8m0(server_args.disable_deepgemm_ue8m0)
-
    # Generate m_max
    m_max = 1024 * 16
    if server_args.chunked_prefill_size < 1:
--- a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
+++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
@@ -29,21 +29,4 @@ def _is_blackwell_arch() -> bool:
 ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()

 DEEPGEMM_BLACKWELL = ENABLE_JIT_DEEPGEMM and _is_blackwell_arch()
-# Allow disabling UE8M0 scaling for accuracy-critical workloads
-# This can help with DeepSeek EP accuracy issues on B200 GPUs
-# Will be updated by server args in update_deepgemm_scale_ue8m0()
-DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL and get_bool_env_var(
-    "SGL_ENABLE_DEEPGEMM_UE8M0", default="true"
-)
-
-
-def update_deepgemm_scale_ue8m0(disable_ue8m0: bool):
-    """Update DEEPGEMM_SCALE_UE8M0 based on server arguments."""
-    global DEEPGEMM_SCALE_UE8M0
-    if disable_ue8m0:
-        DEEPGEMM_SCALE_UE8M0 = False
-        logger.info("DeepGEMM UE8M0 scaling disabled via server argument")
-    else:
-        DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL and get_bool_env_var(
-            "SGL_ENABLE_DEEPGEMM_UE8M0", default="true"
-        )
+DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL