From 91c39ebae6af173ad923d3ce962c102037370763 Mon Sep 17 00:00:00 2001 From: dsxsteven <36877507+dsxsteven@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:51:08 +0800 Subject: [PATCH] [BugFix] [dcp] Fix GQA Model Error when Enable both DP and DCP (#7012) ### What this PR does / why we need it? For GQA models, when both DP and DCP are enabled (and PCP is disabled), the key and value tensors were not being sliced correctly; this is now fixed. Signed-off-by: dsxsteven --- vllm_ascend/attention/context_parallel/attention_cp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/context_parallel/attention_cp.py b/vllm_ascend/attention/context_parallel/attention_cp.py index c9da487b..14e0895a 100644 --- a/vllm_ascend/attention/context_parallel/attention_cp.py +++ b/vllm_ascend/attention/context_parallel/attention_cp.py @@ -938,8 +938,8 @@ class AscendAttentionCPImpl(AscendAttentionBackendImpl): prefill_query = query[self.pcp_size * num_decode_tokens :] else: prefill_query = query[num_decode_tokens:num_actual_tokens_pcp_padded].contiguous() - key = key[self.pcp_size * num_decode_tokens :].contiguous() - value = value[self.pcp_size * num_decode_tokens :].contiguous() + key = key[self.pcp_size * num_decode_tokens : attn_metadata.num_actual_tokens_pcp_padded].contiguous() + value = value[self.pcp_size * num_decode_tokens : attn_metadata.num_actual_tokens_pcp_padded].contiguous() if has_chunked_context: # all_gather q for chunked prefill // overlap the computation inner current chunk