From a22b532d38fe6f1cda1a2e61d1f9af8888cac256 Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Fri, 19 Sep 2025 22:35:14 +0800 Subject: [PATCH] [Fixbug] Fix shape not match when sliding_window and dynamic batch_size (#2830) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Fix a shape mismatch when testing LLM-Research/Phi-4-mini-instruct accuracy ### Does this PR introduce _any_ user-facing change? Previously, users couldn't set a dynamic batch_size or use lm_eval to test accuracy when using models with sliding_window ### How was this patch tested? accuracy of LLM-Research/Phi-4-mini-instruct is ok: ``` vllm (pretrained=LLM-Research/Phi-4-mini-instruct,max_model_len=4096,dtype=auto,tensor_parallel_size=1), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.8105|± |0.0108| | | |strict-match | 5|exact_match|↑ |0.8097|± |0.0108| ``` - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/3c96e7b8a1ffac31b947c753f4fe1d0cd14abf87 Signed-off-by: hfadzxy --- tests/ut/attention/test_attention_v1.py | 35 +++++++++++++++++++++++++ vllm_ascend/attention/attention_v1.py | 3 ++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py index 94a34e9..54be152 100644 --- a/tests/ut/attention/test_attention_v1.py +++ b/tests/ut/attention/test_attention_v1.py @@ -435,6 +435,41 @@ class TestAscendAttentionBackendImpl(TestBase): mock_fused_infer_attention_score.assert_called_once() assert output.shape == (10, 8 * 64) + @patch('torch_npu._npu_reshape_and_cache') + @patch('torch_npu._npu_paged_attention') + @patch('torch_npu.npu_fused_infer_attention_score') + def 
test_forward_decode_only_swa_seq_len_mismatch( + self, mock_fused_infer_attention_score, mock_paged_attention, + mock_npu_reshape_and_cache): + """Test forward pass in DecodeOnly state when seq_len does not match query batch size""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 5, 128, 8, 64) + + metadata = self.attn_metadata + metadata.attn_state = AscendAttentionState.DecodeOnly + metadata.seq_lens = torch.tensor([10]) # len == 1 != query.size(0)==10 + metadata.block_tables = torch.zeros(1, 5, dtype=torch.long) + metadata.num_actual_tokens = 10 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + + mock_fused_infer_attention_score.return_value = (torch.ones(10, 8, + 64), 1) + + output = self.impl_swa.forward(self.layer_no_quant, + query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + + mock_paged_attention.assert_called_once() + mock_fused_infer_attention_score.assert_not_called() + + assert output.shape == (10, 8 * 64) + @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False) + @patch('torch_npu._npu_reshape_and_cache') + @patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill') diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index e0905fa..10a2f6a 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -378,7 +378,8 @@ class AscendAttentionBackendImpl(AttentionImpl): # seq_lens_tensor needs to be transferred to the device for 310P. attn_metadata.seq_lens = \ attn_metadata.seq_lens.to(device=query.device) - if self.sliding_window is not None: + if self.sliding_window is not None and attn_metadata.seq_lens.shape[ + 0] == query.size(0): batch_size = attn_metadata.seq_lens.shape[0] block_size = 128 query = query.view(batch_size, 1, self.num_heads * self.head_size)