[2/2] Support MHA prefill with FlashAttention 4. (#10937)

Co-authored-by: Hieu Pham <hyhieu@gmail.com>
This commit is contained in:
Lifu Huang
2025-10-08 00:54:20 -07:00
committed by GitHub
parent 97cd38e58d
commit edefab0c64
7 changed files with 34 additions and 23 deletions

View File

@@ -1746,16 +1746,10 @@ class ModelRunner:
def _get_attention_backend(self):
"""Init attention kernel backend."""
self.decode_attention_backend_str = (
self.server_args.decode_attention_backend
if self.server_args.decode_attention_backend
else self.server_args.attention_backend
)
self.prefill_attention_backend_str = (
self.server_args.prefill_attention_backend
if self.server_args.prefill_attention_backend
else self.server_args.attention_backend
self.prefill_attention_backend_str, self.decode_attention_backend_str = (
self.server_args.get_attention_backends()
)
if self.decode_attention_backend_str != self.prefill_attention_backend_str:
from sglang.srt.layers.attention.hybrid_attn_backend import (
HybridAttnBackend,