From 1858f3d36ecd649257c81a8699aae2ba84ee68b8 Mon Sep 17 00:00:00 2001
From: Feng Liu <46866849+ader47@users.noreply.github.com>
Date: Thu, 25 Dec 2025 22:46:08 +0800
Subject: [PATCH] [Bugfix] Fix Qwen P/D Disaggregation accuracy issue (#5340)

### What this PR does / why we need it?
Fix Qwen P/D Disaggregation accuracy issue

- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586

Signed-off-by: F.Liu
Co-authored-by: F.Liu
---
 vllm_ascend/attention/attention_cp.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm_ascend/attention/attention_cp.py b/vllm_ascend/attention/attention_cp.py
index 22c58369..9a426ec2 100644
--- a/vllm_ascend/attention/attention_cp.py
+++ b/vllm_ascend/attention/attention_cp.py
@@ -549,6 +549,11 @@ class AscendAttentionCPImpl(AscendAttentionBackendImpl):
         attn_out, attn_lse = torch_npu.npu_fused_infer_attention_score(
             query, k_nope, value, **common_kwargs)
 
+        out_mask = attn_metadata.decode_meta.batch_seq_mask[:, None,
+                                                            None].expand_as(
+                                                                attn_out)
+        attn_out = torch.where(out_mask, 0, attn_out)
+
         lse_mask = attn_metadata.decode_meta.batch_seq_mask[:, None,
                                                             None].expand_as(
                                                                 attn_lse)