feat: Add FlexAttention Backend for Efficient Sparse Attention (#9947)

Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
2025-09-19 02:49:17 +08:00
parent 388c05d544
commit 9a5c42f9ad
4 changed files with 390 additions and 0 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -93,6 +93,7 @@ ATTENTION_BACKEND_CHOICES = [
    # Common
    "triton",
    "torch_native",
+    "flex_attention",
    # NVIDIA specific
    "cutlass_mla",
    "fa3",
@@ -592,6 +593,15 @@ class ServerArgs:
            )
            self.disable_cuda_graph = True

+        if self.attention_backend == "flex_attention":
+            logger.warning(
+                "Cuda graph is disabled because of using torch Flex Attention backend"
+            )
+            self.disable_cuda_graph = True
+            assert (
+                self.speculative_algorithm is None
+            ), "Speculative decoding is currently not supported with Flex Attention backend"
+
        if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
            logger.warning(
                "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."