diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index b19b7bb32..773cb31a3 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -802,7 +802,32 @@ class ServerArgs: hf_config = self.get_hf_config() model_arch = hf_config.architectures[0] - if model_arch in ["GptOssForCausalLM"]: + if model_arch in ["DeepseekV3ForCausalLM"]: + if is_cuda() and is_sm100_supported(): + if ( + self.attention_backend is None + and self.prefill_attention_backend is None + and self.decode_attention_backend is None + ): + self.attention_backend = "trtllm_mla" + logger.info( + "Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM" + ) + if not self.enable_dp_attention: + self.enable_flashinfer_allreduce_fusion = True + logger.info( + "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM" + ) + if ( + self.quantization == "modelopt_fp4" + and self.moe_runner_backend == "auto" + ): + self.moe_runner_backend = "flashinfer_trtllm" + logger.info( + "Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM" + ) + + elif model_arch in ["GptOssForCausalLM"]: if ( self.attention_backend is None and self.prefill_attention_backend is None