[Feat] QWen-1M context support[2/2]: Update block sparse attention backend (#5949)

2025-08-07 14:49:36 +08:00
parent a69b637014
commit b7cd743038
15 changed files with 2121 additions and 4 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -502,6 +502,20 @@ class ServerArgs:
                # use bf16 for mxfp4 triton kernels
                self.dtype = "bfloat16"

+        if self.attention_backend == "dual_chunk_flash_attn":
+            logger.warning(
+                "Mixed chunk is disabled because of using dual chunk flash attention backend"
+            )
+            logger.warning(
+                "Radix cache is disabled because of using dual chunk flash attention backend"
+            )
+            logger.warning(
+                "Cuda graph is disabled because of using dual chunk flash attention backend"
+            )
+            self.enable_mixed_chunk = False
+            self.disable_cuda_graph = True
+            self.disable_radix_cache = True
+
        # Set page size
        if self.page_size is None:
            self.page_size = 1
@@ -1337,6 +1351,7 @@ class ServerArgs:
                "triton",
                "trtllm_mla",
                "trtllm_mha",
+                "dual_chunk_flash_attn",
            ],
            default=ServerArgs.attention_backend,
            help="Choose the kernels for attention layers.",