[Fix] DeepSeek EP accuracy issue on B200 GPUs (#9946)

This commit is contained in:
Al-Ekram Elahee Hridoy
2025-09-02 20:31:15 -06:00
committed by GitHub
parent 60e37f8028
commit 6243c36702
3 changed files with 31 additions and 1 deletion

View File

@@ -44,6 +44,13 @@ def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
global _DO_COMPILE_ALL
global _IS_FIRST_RANK_ON_NODE
# Update UE8M0 scaling configuration based on server args
from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
update_deepgemm_scale_ue8m0,
)
update_deepgemm_scale_ue8m0(server_args.disable_deepgemm_ue8m0)
# Generate m_max
m_max = 1024 * 16
if server_args.chunked_prefill_size < 1:

View File

@@ -29,4 +29,21 @@ def _is_blackwell_arch() -> bool:
# Module-level feature flags for the DeepGEMM wrapper.
# JIT DeepGEMM is enabled according to _compute_enable_deep_gemm().
ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
# Truthy only when JIT DeepGEMM is enabled and _is_blackwell_arch() is true.
DEEPGEMM_BLACKWELL = ENABLE_JIT_DEEPGEMM and _is_blackwell_arch()
# NOTE(review): this unconditional assignment is overwritten by the
# env-gated assignment a few lines below. In this diff view it looks like
# the pre-change line that the commit removes -- confirm against the file.
DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL
# Allow disabling UE8M0 scaling for accuracy-critical workloads.
# This can help with DeepSeek EP accuracy issues on B200 GPUs.
# UE8M0 scaling is kept only when DEEPGEMM_BLACKWELL is truthy and
# get_bool_env_var("SGL_ENABLE_DEEPGEMM_UE8M0", default="true") is truthy;
# presumably setting that variable to a false value turns it off -- verify
# against get_bool_env_var.
# The value is recomputed later from server args by
# update_deepgemm_scale_ue8m0().
DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL and get_bool_env_var(
"SGL_ENABLE_DEEPGEMM_UE8M0", default="true"
)
def update_deepgemm_scale_ue8m0(disable_ue8m0: bool):
    """Recompute the module-level ``DEEPGEMM_SCALE_UE8M0`` flag.

    Args:
        disable_ue8m0: When true, UE8M0 scaling is forced off and an info
            message is logged. When false, the flag is re-derived from
            ``DEEPGEMM_BLACKWELL`` and the ``SGL_ENABLE_DEEPGEMM_UE8M0``
            environment variable (read via ``get_bool_env_var`` with
            default ``"true"``).

    Returns:
        None. The result is stored in the module global.
    """
    global DEEPGEMM_SCALE_UE8M0

    if not disable_ue8m0:
        # No override from server args: apply the Blackwell + env-var rule.
        # The env var is only consulted when DEEPGEMM_BLACKWELL is truthy.
        DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL and get_bool_env_var(
            "SGL_ENABLE_DEEPGEMM_UE8M0", default="true"
        )
        return

    # An explicit server argument wins over the environment variable.
    DEEPGEMM_SCALE_UE8M0 = False
    logger.info("DeepGEMM UE8M0 scaling disabled via server argument")