Update DeepSeek-R1-FP4 default config on blackwell (#11512)
@@ -802,7 +802,32 @@ class ServerArgs:
         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
-        if model_arch in ["GptOssForCausalLM"]:
+        if model_arch in ["DeepseekV3ForCausalLM"]:
+            if is_cuda() and is_sm100_supported():
+                if (
+                    self.attention_backend is None
+                    and self.prefill_attention_backend is None
+                    and self.decode_attention_backend is None
+                ):
+                    self.attention_backend = "trtllm_mla"
+                    logger.info(
+                        "Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if (
+                    self.quantization == "modelopt_fp4"
+                    and self.moe_runner_backend == "auto"
+                ):
+                    self.moe_runner_backend = "flashinfer_trtllm"
+                    logger.info(
+                        "Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+
+        elif model_arch in ["GptOssForCausalLM"]:
             if (
                 self.attention_backend is None
                 and self.prefill_attention_backend is None
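For reference, a minimal standalone sketch of the default-selection logic in this hunk. The Args dataclass and apply_sm100_deepseek_defaults helper below are illustrative names, not part of the sglang ServerArgs API, and the real branch only applies when the model architecture is DeepseekV3ForCausalLM and is_cuda() and is_sm100_supported() both hold.

    # Sketch only: mirrors the decision flow of the new SM100 / DeepseekV3 branch.
    # Names here (Args, apply_sm100_deepseek_defaults) are hypothetical, not sglang's.
    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class Args:
        attention_backend: Optional[str] = None
        prefill_attention_backend: Optional[str] = None
        decode_attention_backend: Optional[str] = None
        enable_dp_attention: bool = False
        enable_flashinfer_allreduce_fusion: bool = False
        quantization: Optional[str] = None
        moe_runner_backend: str = "auto"


    def apply_sm100_deepseek_defaults(args: Args) -> Args:
        # Pick trtllm_mla only if the user has not chosen any attention backend.
        if (
            args.attention_backend is None
            and args.prefill_attention_backend is None
            and args.decode_attention_backend is None
        ):
            args.attention_backend = "trtllm_mla"
        # AllReduce fusion is enabled only when DP attention is off.
        if not args.enable_dp_attention:
            args.enable_flashinfer_allreduce_fusion = True
        # FP4 (modelopt_fp4) checkpoints default to the flashinfer_trtllm MoE runner.
        if args.quantization == "modelopt_fp4" and args.moe_runner_backend == "auto":
            args.moe_runner_backend = "flashinfer_trtllm"
        return args


    if __name__ == "__main__":
        # Example: an FP4 checkpoint with no backend overrides picks all three defaults.
        print(apply_sm100_deepseek_defaults(Args(quantization="modelopt_fp4")))

The design choice visible in the diff is that each default is guarded independently: an explicit user setting (any attention backend flag, dp attention, or a non-"auto" MoE runner) suppresses only the corresponding override rather than the whole branch.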