[Feature] Support FA3 backend for MLA (#4831)
@@ -230,6 +230,10 @@ class ModelRunner:
         elif server_args.enable_flashmla:
             logger.info("MLA optimization is turned on. Use flashmla decode.")
             server_args.attention_backend = "flashmla"
+        elif server_args.attention_backend == "fa3":
+            logger.info(
+                "MLA optimization is turned on. Use flash attention 3 backend."
+            )
         else:
             logger.info("MLA optimization is turned on. Use triton backend.")
             server_args.attention_backend = "triton"
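For context, the new `elif` honors an explicit `--attention-backend fa3` request instead of letting the final `else` override it to triton. Below is a minimal standalone sketch of that dispatch; `ServerArgs` here is a hypothetical stand-in (the real class lives in `sglang.srt.server_args` and has many more fields), `select_mla_backend` is a hypothetical helper name, and the diff starts mid-chain, so earlier branches are omitted.

from dataclasses import dataclass
from typing import Optional


@dataclass
class ServerArgs:
    # Hypothetical stand-in for sglang's ServerArgs, reduced to the two
    # fields this dispatch reads.
    enable_flashmla: bool = False
    attention_backend: Optional[str] = None


def select_mla_backend(server_args: ServerArgs) -> str:
    # Mirrors the elif-chain in the hunk above: flashmla wins when
    # requested, an explicit "fa3" choice is left untouched, and anything
    # else falls back to the triton backend.
    if server_args.enable_flashmla:
        server_args.attention_backend = "flashmla"
    elif server_args.attention_backend != "fa3":
        server_args.attention_backend = "triton"
    return server_args.attention_backend


assert select_mla_backend(ServerArgs(attention_backend="fa3")) == "fa3"
assert select_mla_backend(ServerArgs()) == "triton"
assert select_mla_backend(ServerArgs(enable_flashmla=True)) == "flashmla"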
@@ -879,7 +883,7 @@ class ModelRunner:
                 "Please use `--attention-backend flashinfer`."
             )
             logger.warning(
-                "FlashAttention v3 Backend is in Beta. Multimodal, Page > 1, FP8, MLA and Speculative Decoding are not supported."
+                "FlashAttention v3 Backend is in Beta. Multimodal, FP8, and Speculative Decoding are not supported."
             )
             from sglang.srt.layers.attention.flashattention_backend import (
                 FlashAttentionBackend,
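This hunk trims the unsupported-feature list in the beta warning, dropping "Page > 1" and "MLA" now that the commit wires MLA into the FA3 path. A hedged sketch of the construction pattern around that warning follows; `create_fa3_backend` is a hypothetical helper, and the `FlashAttentionBackend(model_runner)` call is an assumption inferred from the surrounding ModelRunner code, not a confirmed signature.

import logging

logger = logging.getLogger(__name__)


def create_fa3_backend(model_runner):
    # Surface the beta caveats at construction time, then import lazily so
    # environments without the FA3 kernels avoid the import at startup.
    logger.warning(
        "FlashAttention v3 Backend is in Beta. "
        "Multimodal, FP8, and Speculative Decoding are not supported."
    )
    from sglang.srt.layers.attention.flashattention_backend import (
        FlashAttentionBackend,
    )
    # Assumed constructor: the backend takes the owning ModelRunner, as the
    # surrounding code suggests; check the class for the exact signature.
    return FlashAttentionBackend(model_runner)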