From 912788c095c9306daabc996fd06e59cf062a783b Mon Sep 17 00:00:00 2001 From: Chang Su Date: Tue, 13 May 2025 17:18:38 -0700 Subject: [PATCH] perf: optimize local_block_table memory allocation (#6273) --- python/sglang/srt/layers/attention/flashattention_backend.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py index f200a367b..2f974ea9a 100644 --- a/python/sglang/srt/layers/attention/flashattention_backend.py +++ b/python/sglang/srt/layers/attention/flashattention_backend.py @@ -1165,7 +1165,6 @@ class FlashAttentionBackend(AttentionBackend): max_virtual_batches = max_bs * ( (max_seq_len + attn_chunk_size - 1) // attn_chunk_size ) - max_blocks_per_seq = (max_seq_len + attn_chunk_size - 1) // attn_chunk_size max_pages_per_block = (attn_chunk_size + page_size - 1) // page_size self.decode_cuda_graph_local_attn_metadata = { @@ -1177,7 +1176,7 @@ class FlashAttentionBackend(AttentionBackend): ), "local_block_table": torch.zeros( max_virtual_batches, - max_blocks_per_seq * max_pages_per_block, + max_pages_per_block, dtype=torch.int32, device=self.device, ),