[Feature] Add FlashAttention3 as a backend for VisionAttention (#5764)

Co-authored-by: othame <chenzhu_912@zju.edu.cn>
Co-authored-by: Mick <mickjagger19@icloud.com>
Co-authored-by: Yi Zhang <1109276519@qq.com>
This commit is contained in:
Zhu Chen
2025-05-09 01:01:19 +08:00
committed by GitHub
parent f1ff736d68
commit fa7d7fd9e5
11 changed files with 328 additions and 186 deletions

View File

@@ -187,6 +187,7 @@ class ServerArgs:
n_share_experts_fusion: int = 0
disable_chunked_prefix_cache: bool = False
disable_fast_image_processor: bool = False
# Name of the attention backend to use for multimodal attention.
# Defaults to None, i.e. no backend was explicitly requested.
# NOTE(review): how a None value is resolved to a concrete backend is not
# visible in this hunk — confirm against the attention layer that reads it.
mm_attention_backend: Optional[str] = None
# Debug tensor dumps
debug_tensor_dump_output_folder: Optional[str] = None
@@ -1265,6 +1266,14 @@ class ServerArgs:
help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
)
# Register the --mm-attention-backend CLI flag (multimodal attention backend).
# argparse rejects any value outside `choices`; when the flag is omitted the
# value falls back to the ServerArgs.mm_attention_backend class default.
# NOTE(review): "fa3" presumably refers to FlashAttention3 (per the commit
# title) — confirm.
parser.add_argument(
"--mm-attention-backend",
type=str,
choices=["sdpa", "fa3", "triton_attn"],
default=ServerArgs.mm_attention_backend,
help="Set multimodal attention backend.",
)
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
args.tp_size = args.tensor_parallel_size