From 9fadc8df4f36bb85829a2be05afad0ce28196992 Mon Sep 17 00:00:00 2001 From: Shaoxu Cheng <2906339855@qq.com> Date: Wed, 28 Jan 2026 16:41:32 +0800 Subject: [PATCH] [BugFix]: fix 310p chunked-prefill error caused by refactor (#6340) Adapt to the model-runner refactor change so that 310p works - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd Signed-off-by: Tflowers-0129 <2906339855@qq.com> --- vllm_ascend/_310p/attention/attention_v1.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/_310p/attention/attention_v1.py b/vllm_ascend/_310p/attention/attention_v1.py index feaef523..3637685b 100644 --- a/vllm_ascend/_310p/attention/attention_v1.py +++ b/vllm_ascend/_310p/attention/attention_v1.py @@ -101,8 +101,7 @@ class AscendAttentionBackendImpl310(_BaseImpl): out=output, ) - out_real = output[:real_tokens, :, :] - return out_real + return output[:aligned_tokens, :, :] def _forward_chunked_prefill_310p(self, query, attn_metadata, output): assert attn_metadata is not None @@ -110,6 +109,10 @@ class AscendAttentionBackendImpl310(_BaseImpl): if query.dtype == torch.float32: query = query.to(torch.float16) + num_actual_tokens = int(attn_metadata.num_actual_tokens) + query = query[:num_actual_tokens] + output = output[:num_actual_tokens] + qsl_cpu = attn_metadata.query_start_loc.detach().to("cpu", dtype=torch.int32) qlens = (qsl_cpu[1:] - qsl_cpu[:-1]).to(torch.int32) @@ -163,8 +166,7 @@ class AscendAttentionBackendImpl310(_BaseImpl): k = key[:num_tokens] v = value[:num_tokens] out = self._forward_prefill_310p_fallback(q, k, v, attn_metadata, output) - output[:num_tokens] = out - return output + return out if state == AscendAttentionState.ChunkedPrefill: self._forward_chunked_prefill_310p(query, attn_metadata, output)