From aac531c53b0166bc3883d1f6491f7f0fbb928197 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 8 Apr 2025 18:43:13 -0700 Subject: [PATCH] [Bugfix] Fix index out of bounds in local attention with large sequences (#5173) --- .../sglang/srt/layers/attention/flashattention_backend.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py index 78efc4332..9e6365cbf 100644 --- a/python/sglang/srt/layers/attention/flashattention_backend.py +++ b/python/sglang/srt/layers/attention/flashattention_backend.py @@ -236,7 +236,11 @@ def make_local_attention_virtual_batches( np.arange(pages_per_local_batch, dtype=np.int32), (virtual_batches, pages_per_local_batch), ) + np.expand_dims(block_starts, axis=1) - block_indices = block_indices.flatten() + # Clamp block_indices to block_table's last page column so indexing + # block_table with them cannot go out of bounds. This occurs with large + # sequences (>8192 tokens), or whenever block_table has fewer pages than + # the full attention chunk size would require. + block_indices = block_indices.flatten().clip(max=block_table.shape[1] - 1) batch_indices = np.repeat( np.arange(actual_batch_size, dtype=np.int32), local_blocks * pages_per_local_batch,