[1/n] Enable DCA CUDA graph capture (#9537)
This commit is contained in:
@@ -1537,7 +1537,7 @@ class DualChunkFlashAttentionBackend(AttentionBackend):
 query_inter,
 key_cache,
 value_cache,
-block_table[:, : decode_meta.max_seq_len_inter],
+block_table,
 decode_meta.seq_lens_inter,
 softmax_scale,
 causal=False,
@@ -878,10 +878,9 @@ class ServerArgs:
 
 if self.attention_backend == "dual_chunk_flash_attn":
 logger.warning(
-"Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
+"Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
 )
 self.enable_mixed_chunk = False
-self.disable_cuda_graph = True
 self.disable_radix_cache = True
 
 def _handle_page_size(self):
Reference in New Issue
Block a user