[Feature] Add FlashAttention3 as a backend for VisionAttention (#5764)
Co-authored-by: othame <chenzhu_912@zju.edu.cn>
Co-authored-by: Mick <mickjagger19@icloud.com>
Co-authored-by: Yi Zhang <1109276519@qq.com>
This commit is contained in:
@@ -187,6 +187,7 @@ class ServerArgs:
|
||||
n_share_experts_fusion: int = 0
|
||||
disable_chunked_prefix_cache: bool = False
|
||||
disable_fast_image_processor: bool = False
|
||||
mm_attention_backend: Optional[str] = None
|
||||
|
||||
# Debug tensor dumps
|
||||
debug_tensor_dump_output_folder: Optional[str] = None
|
||||
@@ -1265,6 +1266,14 @@ class ServerArgs:
|
||||
help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--mm-attention-backend",
|
||||
type=str,
|
||||
choices=["sdpa", "fa3", "triton_attn"],
|
||||
default=ServerArgs.mm_attention_backend,
|
||||
help="Set multimodal attention backend.",
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_cli_args(cls, args: argparse.Namespace):
|
||||
args.tp_size = args.tensor_parallel_size
|
||||
|
||||
Reference in New Issue
Block a user