From 1858f3d36ecd649257c81a8699aae2ba84ee68b8 Mon Sep 17 00:00:00 2001
From: Feng Liu <46866849+ader47@users.noreply.github.com>
Date: Thu, 25 Dec 2025 22:46:08 +0800
Subject: [PATCH] [Bugfix] Fix Qwen P/D Disaggregation accuracy issue (#5340)

### What this PR does / why we need it?
Fix Qwen P/D Disaggregation accuracy issue

- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586

Signed-off-by: F.Liu
Co-authored-by: F.Liu
---
 vllm_ascend/attention/attention_cp.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm_ascend/attention/attention_cp.py b/vllm_ascend/attention/attention_cp.py
index 22c58369..9a426ec2 100644
--- a/vllm_ascend/attention/attention_cp.py
+++ b/vllm_ascend/attention/attention_cp.py
@@ -549,6 +549,11 @@ class AscendAttentionCPImpl(AscendAttentionBackendImpl):
         attn_out, attn_lse = torch_npu.npu_fused_infer_attention_score(
             query, k_nope, value, **common_kwargs)
 
+        out_mask = attn_metadata.decode_meta.batch_seq_mask[:, None,
+                                                            None].expand_as(
+                                                                attn_out)
+        attn_out = torch.where(out_mask, 0, attn_out)
+
         lse_mask = attn_metadata.decode_meta.batch_seq_mask[:, None,
                                                             None].expand_as(
                                                                 attn_lse)