[bugfix](CP,MLA) fix wrong slot_mapping of decode for mixed p/d batch (#6344)
### What this PR does / why we need it?
PR #5672 attempted to remove the -1 padding for duplicate tokens from the
decode slot_mapping when adapting PCP for MLAPO, and switched to a simpler
slicing approach. However, in the single-ops path and in mixed
prefill/decode (PD) batches, the decode slot_mapping still contained the -1
padding but went through the same slicing, which produced an incorrect
slot_mapping. This PR fixes that issue; the logic will be consolidated
further in subsequent refactoring PRs.
- vLLM version: v0.14.1
- vLLM main: dc917cceb8
---------
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
This commit is contained in:
@@ -29,6 +29,7 @@ from unittest.mock import patch
|
|||||||
import pytest
|
import pytest
|
||||||
import torch_npu
|
import torch_npu
|
||||||
from modelscope import snapshot_download # type: ignore
|
from modelscope import snapshot_download # type: ignore
|
||||||
|
from tests.e2e.conftest import wait_until_npu_memory_free
|
||||||
|
|
||||||
MODELS = ["Qwen/Qwen3-0.6B"]
|
MODELS = ["Qwen/Qwen3-0.6B"]
|
||||||
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
|
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
|
||||||
@@ -110,6 +111,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
|
|||||||
|
|
||||||
|
|
||||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
|
||||||
|
@wait_until_npu_memory_free()
|
||||||
def test_qwen3_external_launcher_with_sleepmode():
|
def test_qwen3_external_launcher_with_sleepmode():
|
||||||
script = Path(
|
script = Path(
|
||||||
__file__
|
__file__
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ class AscendMlaCPMetadataBuilder(AscendMLAMetadataBuilder):
|
|||||||
fast_build: bool = False,
|
fast_build: bool = False,
|
||||||
) -> AscendMLAMetadata:
|
) -> AscendMLAMetadata:
|
||||||
metadata_cls = super().build(common_prefix_len, common_attn_metadata)
|
metadata_cls = super().build(common_prefix_len, common_attn_metadata)
|
||||||
if self.num_prefills == 0 and self.pcp_size > 1:
|
if self.pcp_size > 1:
|
||||||
self.slot_mapping[: self.num_decode_tokens] = self.slot_mapping[
|
self.slot_mapping[: self.num_decode_tokens] = self.slot_mapping[
|
||||||
: self.num_decode_tokens * self.pcp_size : self.pcp_size
|
: self.num_decode_tokens * self.pcp_size : self.pcp_size
|
||||||
]
|
]
|
||||||
|
|||||||
Reference in New Issue
Block a user