From 50e0e87646ac59547b9ec859c92505c91e4aaf47 Mon Sep 17 00:00:00 2001 From: Qiu Date: Thu, 29 Jan 2026 16:48:37 +0800 Subject: [PATCH] [bugfix](CP,MLA) fix wrong slot_mapping of decode for mixed p/d batch (#6344) ### What this PR does / why we need it? PR #5672 attempted to remove the -1 padding for duplicate tokens in the decode slot_mapping when adapting PCP for MLAPO, and adopted a simpler slicing approach. However, in the single-ops logic and in mixed PD batches, the -1 padding was never removed from the decode slot_mapping, yet the same slicing approach was still applied, producing an incorrect slot_mapping. This PR resolves this issue, and the logic will be further consolidated in subsequent refactoring PRs. - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd --------- Signed-off-by: QiuChunshuo --- tests/e2e/multicard/2-cards/test_external_launcher.py | 2 ++ vllm_ascend/attention/context_parallel/mla_cp.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/e2e/multicard/2-cards/test_external_launcher.py b/tests/e2e/multicard/2-cards/test_external_launcher.py index dfc4ee75..f8b59e38 100644 --- a/tests/e2e/multicard/2-cards/test_external_launcher.py +++ b/tests/e2e/multicard/2-cards/test_external_launcher.py @@ -29,6 +29,7 @@ from unittest.mock import patch import pytest import torch_npu from modelscope import snapshot_download # type: ignore +from tests.e2e.conftest import wait_until_npu_memory_free MODELS = ["Qwen/Qwen3-0.6B"] MOE_MODELS = ["Qwen/Qwen3-30B-A3B"] @@ -110,6 +111,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model): @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"}) +@wait_until_npu_memory_free() def test_qwen3_external_launcher_with_sleepmode(): script = Path( __file__ ) diff --git a/vllm_ascend/attention/context_parallel/mla_cp.py b/vllm_ascend/attention/context_parallel/mla_cp.py index a53dfb58..de1bc5f3 100644 --- a/vllm_ascend/attention/context_parallel/mla_cp.py +++ 
b/vllm_ascend/attention/context_parallel/mla_cp.py @@ -79,7 +79,7 @@ class AscendMlaCPMetadataBuilder(AscendMLAMetadataBuilder): fast_build: bool = False, ) -> AscendMLAMetadata: metadata_cls = super().build(common_prefix_len, common_attn_metadata) - if self.num_prefills == 0 and self.pcp_size > 1: + if self.pcp_size > 1: self.slot_mapping[: self.num_decode_tokens] = self.slot_mapping[ : self.num_decode_tokens * self.pcp_size : self.pcp_size ]