diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 923482d72..317734578 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -378,6 +378,12 @@ class ModelRunner: ) server_args.attention_backend = "torch_native" + if server_args.prefill_attention_backend is not None and ( + server_args.prefill_attention_backend + == server_args.decode_attention_backend + ): # override the default attention backend + server_args.attention_backend = server_args.prefill_attention_backend + if server_args.attention_backend is None: """ Auto select the fastest attention backend.