@@ -44,13 +44,6 @@ def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
|
|||||||
global _DO_COMPILE_ALL
|
global _DO_COMPILE_ALL
|
||||||
global _IS_FIRST_RANK_ON_NODE
|
global _IS_FIRST_RANK_ON_NODE
|
||||||
|
|
||||||
# Update UE8M0 scaling configuration based on server args
|
|
||||||
from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
|
|
||||||
update_deepgemm_scale_ue8m0,
|
|
||||||
)
|
|
||||||
|
|
||||||
update_deepgemm_scale_ue8m0(server_args.disable_deepgemm_ue8m0)
|
|
||||||
|
|
||||||
# Generate m_max
|
# Generate m_max
|
||||||
m_max = 1024 * 16
|
m_max = 1024 * 16
|
||||||
if server_args.chunked_prefill_size < 1:
|
if server_args.chunked_prefill_size < 1:
|
||||||
|
|||||||
@@ -29,21 +29,4 @@ def _is_blackwell_arch() -> bool:
|
|||||||
ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
|
ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
|
||||||
|
|
||||||
DEEPGEMM_BLACKWELL = ENABLE_JIT_DEEPGEMM and _is_blackwell_arch()
|
DEEPGEMM_BLACKWELL = ENABLE_JIT_DEEPGEMM and _is_blackwell_arch()
|
||||||
# Allow disabling UE8M0 scaling for accuracy-critical workloads
|
DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL
|
||||||
# This can help with DeepSeek EP accuracy issues on B200 GPUs
|
|
||||||
# Will be updated by server args in update_deepgemm_scale_ue8m0()
|
|
||||||
DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL and get_bool_env_var(
|
|
||||||
"SGL_ENABLE_DEEPGEMM_UE8M0", default="true"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def update_deepgemm_scale_ue8m0(disable_ue8m0: bool):
|
|
||||||
"""Update DEEPGEMM_SCALE_UE8M0 based on server arguments."""
|
|
||||||
global DEEPGEMM_SCALE_UE8M0
|
|
||||||
if disable_ue8m0:
|
|
||||||
DEEPGEMM_SCALE_UE8M0 = False
|
|
||||||
logger.info("DeepGEMM UE8M0 scaling disabled via server argument")
|
|
||||||
else:
|
|
||||||
DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL and get_bool_env_var(
|
|
||||||
"SGL_ENABLE_DEEPGEMM_UE8M0", default="true"
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -268,7 +268,6 @@ class ServerArgs:
|
|||||||
flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
|
flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
|
||||||
enable_flashinfer_allreduce_fusion: bool = False
|
enable_flashinfer_allreduce_fusion: bool = False
|
||||||
deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
|
deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
|
||||||
disable_deepgemm_ue8m0: bool = False
|
|
||||||
ep_num_redundant_experts: int = 0
|
ep_num_redundant_experts: int = 0
|
||||||
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
|
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
|
||||||
init_expert_location: str = "trivial"
|
init_expert_location: str = "trivial"
|
||||||
@@ -1563,11 +1562,6 @@ class ServerArgs:
|
|||||||
default="auto",
|
default="auto",
|
||||||
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
|
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--disable-deepgemm-ue8m0",
|
|
||||||
action="store_true",
|
|
||||||
help="Disable DeepGEMM UE8M0 scaling optimizations. This can help with accuracy issues on Blackwell GPUs (B200) for certain models like DeepSeek.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--ep-num-redundant-experts",
|
"--ep-num-redundant-experts",
|
||||||
type=int,
|
type=int,
|
||||||
|
|||||||
Reference in New Issue
Block a user