feat: Add FlexAttention Backend for Efficient Sparse Attention (#9947)
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
This commit is contained in:
@@ -93,6 +93,7 @@ ATTENTION_BACKEND_CHOICES = [
|
||||
# Common
|
||||
"triton",
|
||||
"torch_native",
|
||||
"flex_attention",
|
||||
# NVIDIA specific
|
||||
"cutlass_mla",
|
||||
"fa3",
|
||||
@@ -592,6 +593,15 @@ class ServerArgs:
|
||||
)
|
||||
self.disable_cuda_graph = True
|
||||
|
||||
if self.attention_backend == "flex_attention":
|
||||
logger.warning(
|
||||
"Cuda graph is disabled because of using torch Flex Attention backend"
|
||||
)
|
||||
self.disable_cuda_graph = True
|
||||
assert (
|
||||
self.speculative_algorithm is None
|
||||
), "Speculative decoding is currently not supported with Flex Attention backend"
|
||||
|
||||
if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
|
||||
logger.warning(
|
||||
"At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
|
||||
|
||||
Reference in New Issue
Block a user