[2/2] Support MHA prefill with FlashAttention 4. (#10937)
Co-authored-by: Hieu Pham <hyhieu@gmail.com>
This commit is contained in:
@@ -1746,16 +1746,10 @@ class ModelRunner:
|
||||
|
||||
def _get_attention_backend(self):
|
||||
"""Init attention kernel backend."""
|
||||
self.decode_attention_backend_str = (
|
||||
self.server_args.decode_attention_backend
|
||||
if self.server_args.decode_attention_backend
|
||||
else self.server_args.attention_backend
|
||||
)
|
||||
self.prefill_attention_backend_str = (
|
||||
self.server_args.prefill_attention_backend
|
||||
if self.server_args.prefill_attention_backend
|
||||
else self.server_args.attention_backend
|
||||
self.prefill_attention_backend_str, self.decode_attention_backend_str = (
|
||||
self.server_args.get_attention_backends()
|
||||
)
|
||||
|
||||
if self.decode_attention_backend_str != self.prefill_attention_backend_str:
|
||||
from sglang.srt.layers.attention.hybrid_attn_backend import (
|
||||
HybridAttnBackend,
|
||||
|
||||
Reference in New Issue
Block a user