[1/n] Enable DCA CUDA graph capture (#9537)

This commit is contained in:
b8zhong
2025-10-02 20:30:00 -07:00
committed by GitHub
parent 7e61737d3f
commit a2faf8940c
2 changed files with 2 additions and 3 deletions

View File

@@ -1537,7 +1537,7 @@ class DualChunkFlashAttentionBackend(AttentionBackend):
query_inter,
key_cache,
value_cache,
- block_table[:, : decode_meta.max_seq_len_inter],
+ block_table,
decode_meta.seq_lens_inter,
softmax_scale,
causal=False,

View File

@@ -878,10 +878,9 @@ class ServerArgs:
if self.attention_backend == "dual_chunk_flash_attn":
logger.warning(
- "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
+ "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
)
self.enable_mixed_chunk = False
- self.disable_cuda_graph = True
self.disable_radix_cache = True
def _handle_page_size(self):