Support FA3 as Attention backend by using --attention-backend fa3 (#4680)
Co-authored-by: qsong <qsong@linkedin.com>
Co-authored-by: qingquansong <ustcsqq@gmail.com>
This commit is contained in:
@@ -868,6 +868,19 @@ class ModelRunner:
             from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend

             self.attn_backend = FlashMLABackend(self)
+        elif self.server_args.attention_backend == "fa3":
+            assert torch.cuda.get_device_capability()[0] >= 9, (
+                "FlashAttention v3 Backend requires SM>=90. "
+                "Please use `--attention-backend flashinfer`."
+            )
+            logger.warning(
+                "FlashAttention v3 Backend is in Beta. Multimodal, Page > 1, FP8, MLA and Speculative Decoding are not supported."
+            )
+            from sglang.srt.layers.attention.flashattention_backend import (
+                FlashAttentionBackend,
+            )
+
+            self.attn_backend = FlashAttentionBackend(self)
         else:
             raise ValueError(
                 f"Invalid attention backend: {self.server_args.attention_backend}"
             )
Reference in New Issue
Block a user