Support FA3 as Attention backend by using --attention-backend fa3 (#4680)

Co-authored-by: qsong <qsong@linkedin.com>
Co-authored-by: qingquansong <ustcsqq@gmail.com>
This commit is contained in:
Stefan He
2025-03-23 23:28:11 -07:00
committed by GitHub
parent af6535e7aa
commit 5d7edc8e55
5 changed files with 622 additions and 1 deletion

View File

@@ -868,6 +868,19 @@ class ModelRunner:
from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
self.attn_backend = FlashMLABackend(self)
elif self.server_args.attention_backend == "fa3":
assert torch.cuda.get_device_capability()[0] >= 9, (
"FlashAttention v3 Backend requires SM>=90. "
"Please use `--attention-backend flashinfer`."
)
logger.warning(
"FlashAttention v3 Backend is in Beta. Multimodal, Page > 1, FP8, MLA and Speculative Decoding are not supported."
)
from sglang.srt.layers.attention.flashattention_backend import (
FlashAttentionBackend,
)
self.attn_backend = FlashAttentionBackend(self)
else:
raise ValueError(
f"Invalid attention backend: {self.server_args.attention_backend}"