From aac531c53b0166bc3883d1f6491f7f0fbb928197 Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 8 Apr 2025 18:43:13 -0700 Subject: [PATCH] [Bugfix] Fix index out of bounds in local attention with large sequences (#5173) --- .../sglang/srt/layers/attention/flashattention_backend.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py index 78efc4332..9e6365cbf 100644 --- a/python/sglang/srt/layers/attention/flashattention_backend.py +++ b/python/sglang/srt/layers/attention/flashattention_backend.py @@ -236,7 +236,11 @@ def make_local_attention_virtual_batches( np.arange(pages_per_local_batch, dtype=np.int32), (virtual_batches, pages_per_local_batch), ) + np.expand_dims(block_starts, axis=1) - block_indices = block_indices.flatten() + # Clamp block_indices to block_table's last page column so indexing + # block_table with them cannot go out of bounds. This occurs with large + # sequences (>8192 tokens), or whenever block_table has fewer pages than + # the full attention chunk size would require. + block_indices = block_indices.flatten().clip(max=block_table.shape[1] - 1) batch_indices = np.repeat( np.arange(actual_batch_size, dtype=np.int32), local_blocks * pages_per_local_batch,