feat: Add FlexAttention Backend for Efficient Sparse Attention (#9947)

Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
This commit is contained in:
yuk.igalaxy
2025-09-19 02:49:17 +08:00
committed by GitHub
parent 388c05d544
commit 9a5c42f9ad
4 changed files with 390 additions and 0 deletions

View File

@@ -93,6 +93,7 @@ ATTENTION_BACKEND_CHOICES = [
# Common
"triton",
"torch_native",
"flex_attention",
# NVIDIA specific
"cutlass_mla",
"fa3",
@@ -592,6 +593,15 @@ class ServerArgs:
)
self.disable_cuda_graph = True
if self.attention_backend == "flex_attention":
logger.warning(
"Cuda graph is disabled because of using torch Flex Attention backend"
)
self.disable_cuda_graph = True
assert (
self.speculative_algorithm is None
), "Speculative decoding is currently not supported with Flex Attention backend"
if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
logger.warning(
"At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."