From 75d05ee200ba7e59b053e9698e04076d2ba9941c Mon Sep 17 00:00:00 2001 From: yiz-liu <136800916+yiz-liu@users.noreply.github.com> Date: Mon, 30 Jun 2025 11:25:19 +0800 Subject: [PATCH] [Core] Fix block table shape to make Prefix cache work with Ascend scheduler (#1446) ### What this PR does / why we need it? This fixes the shape of block_table, which was introduced by hybrid kv groups several weeks ago. An error will be raised when enabling prefix-cache (eager or not) and the Ascend Scheduler at the same time; just send two identical requests and it will reproduce. v0.9.1: https://github.com/vllm-project/vllm-ascend/pull/1297 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Tested manually. Signed-off-by: Yizhou Liu --- vllm_ascend/attention/attention_v1.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 985997e..0255c53 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -355,11 +355,13 @@ class AscendAttentionBackendImpl(AttentionImpl): assert attn_metadata is not None assert attn_metadata.attn_mask is not None compress_mask = attn_metadata.attn_mask + batch_size = attn_metadata.query_lens.shape[0] + block_table = attn_metadata.block_tables[:batch_size, :] torch_npu._npu_flash_attention_qlens( query=query, key_cache=self.key_cache, value_cache=self.value_cache, - block_table=attn_metadata.block_tables, + block_table=block_table, mask=compress_mask, seq_len=attn_metadata.query_lens, context_lens=attn_metadata.seq_lens,