Support FA3 as Attention backend by using --attention-backend fa3 (#4680)

Co-authored-by: qsong <qsong@linkedin.com>
Co-authored-by: qingquansong <ustcsqq@gmail.com>
This commit is contained in:
Stefan He
2025-03-23 23:28:11 -07:00
committed by GitHub
parent af6535e7aa
commit 5d7edc8e55
5 changed files with 622 additions and 1 deletion

View File

@@ -868,6 +868,19 @@ class ModelRunner:
from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
self.attn_backend = FlashMLABackend(self)
elif self.server_args.attention_backend == "fa3":
assert torch.cuda.get_device_capability()[0] >= 9, (
"FlashAttention v3 Backend requires SM>=90. "
"Please use `--attention-backend flashinfer`."
)
logger.warning(
"FlashAttention v3 Backend is in Beta. Multimodal, Page > 1, FP8, MLA and Speculative Decoding are not supported."
)
from sglang.srt.layers.attention.flashattention_backend import (
FlashAttentionBackend,
)
self.attn_backend = FlashAttentionBackend(self)
else:
raise ValueError(
f"Invalid attention backend: {self.server_args.attention_backend}"