[Feature] Add FlashAttention3 as a backend for VisionAttention (#5764)

Co-authored-by: othame <chenzhu_912@zju.edu.cn>
Co-authored-by: Mick <mickjagger19@icloud.com>
Co-authored-by: Yi Zhang <1109276519@qq.com>
2025-05-09 01:01:19 +08:00
parent f1ff736d68
commit fa7d7fd9e5
11 changed files with 328 additions and 186 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -187,6 +187,7 @@ class ServerArgs:
    n_share_experts_fusion: int = 0
    disable_chunked_prefix_cache: bool = False
    disable_fast_image_processor: bool = False
+    mm_attention_backend: Optional[str] = None

    # Debug tensor dumps
    debug_tensor_dump_output_folder: Optional[str] = None
@@ -1265,6 +1266,14 @@ class ServerArgs:
            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
        )

+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
+
    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        args.tp_size = args.tensor_parallel_size