[Fix] DeepSeek EP accuracy issue on B200 GPUs (#9946)
This commit is contained in:
committed by
GitHub
parent
60e37f8028
commit
6243c36702
@@ -268,6 +268,7 @@ class ServerArgs:
|
||||
flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
|
||||
enable_flashinfer_allreduce_fusion: bool = False
|
||||
deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
|
||||
disable_deepgemm_ue8m0: bool = False
|
||||
ep_num_redundant_experts: int = 0
|
||||
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
|
||||
init_expert_location: str = "trivial"
|
||||
@@ -1562,6 +1563,11 @@ class ServerArgs:
|
||||
default="auto",
|
||||
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-deepgemm-ue8m0",
|
||||
action="store_true",
|
||||
help="Disable DeepGEMM UE8M0 scaling optimizations. This can help with accuracy issues on Blackwell GPUs (B200) for certain models like DeepSeek.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ep-num-redundant-experts",
|
||||
type=int,
|
||||
|
||||
Reference in New Issue
Block a user