Update DeepSeek-R1-FP4 default config on blackwell (#11512)
This commit is contained in:
@@ -802,7 +802,32 @@ class ServerArgs:
 
         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
-        if model_arch in ["GptOssForCausalLM"]:
+        if model_arch in ["DeepseekV3ForCausalLM"]:
+            if is_cuda() and is_sm100_supported():
+                if (
+                    self.attention_backend is None
+                    and self.prefill_attention_backend is None
+                    and self.decode_attention_backend is None
+                ):
+                    self.attention_backend = "trtllm_mla"
+                    logger.info(
+                        "Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if (
+                    self.quantization == "modelopt_fp4"
+                    and self.moe_runner_backend == "auto"
+                ):
+                    self.moe_runner_backend = "flashinfer_trtllm"
+                    logger.info(
+                        "Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+
+        elif model_arch in ["GptOssForCausalLM"]:
             if (
                 self.attention_backend is None
                 and self.prefill_attention_backend is None
Reference in New Issue
Block a user