diff --git a/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py b/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py
index 84876b438..775e03bb2 100644
--- a/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py
@@ -1537,7 +1537,7 @@ class DualChunkFlashAttentionBackend(AttentionBackend):
             query_inter,
             key_cache,
             value_cache,
-            block_table[:, : decode_meta.max_seq_len_inter],
+            block_table,
             decode_meta.seq_lens_inter,
             softmax_scale,
             causal=False,
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 8d4ebe74f..029502f5c 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -878,10 +878,9 @@ class ServerArgs:
 
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
-                "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
+                "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
             )
             self.enable_mixed_chunk = False
-            self.disable_cuda_graph = True
             self.disable_radix_cache = True
 
     def _handle_page_size(self):
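
The backend change passes the full block_table to the inter-chunk decode call instead of a view sliced to decode_meta.max_seq_len_inter, and the server-args change stops force-disabling CUDA graphs for this backend. A plausible connection between the two, stated here as an assumption rather than something the diff asserts, is that CUDA graph replay requires tensor shapes to stay fixed across decode steps, while a slice whose width tracks a per-batch maximum changes shape from step to step. A minimal, self-contained sketch of that shape difference (tensor names and sizes below are made up for illustration):

import torch

# Hypothetical fixed-width block table: its (batch, max_blocks) shape never
# changes, which is the property CUDA graph capture needs from its inputs.
block_table = torch.zeros(8, 128, dtype=torch.int32)
# Per-step sequence lengths vary from one decode step to the next.
seq_lens_inter = torch.tensor([3, 7, 1, 0, 5, 2, 9, 4], dtype=torch.int32)

full_view = block_table                                    # shape (8, 128), static
sliced_view = block_table[:, : int(seq_lens_inter.max())]  # shape (8, 9) this step, different next step
print(full_view.shape, sliced_view.shape)

With the full table passed, the kernel presumably relies on decode_meta.seq_lens_inter (still forwarded in the call above) to bound how many blocks per row are actually read.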