From 5f4b13ab3d440f79eb3b2a1f1978ed35413f5a31 Mon Sep 17 00:00:00 2001 From: Qiu Date: Mon, 12 Jan 2026 20:11:46 +0800 Subject: [PATCH] [bugfix](cp) align max_context_chunk to cp_virtual_block_size (#5767) ### What this PR does / why we need it? In the chunked prefill scenario, CP needs to align the `max_context_chunk` to the `cp_virtual_block_size`, but the current implementation only aligns it to the `block_size`. For PD-disaggregation, `cp_kv_cache_interleave_size` is typically set equal to `block_size`, in which case `cp_virtual_block_size=block_size * dcp_size * pcp_size`. Under specific conditions, this can lead to misalignment of certain chunks, subsequently triggering assertion check errors. The fix replaces the `block_size` stored on `AscendMlaCPMetadataBuilder` with the least common multiple of `block_size` and `cp_virtual_block_size`, so that anything aligned to it is a multiple of both. ### Does this PR introduce _any_ user-facing change? No - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d Signed-off-by: QiuChunshuo --- vllm_ascend/attention/context_parallel/mla_cp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_ascend/attention/context_parallel/mla_cp.py b/vllm_ascend/attention/context_parallel/mla_cp.py index 6ff20557..c9c8bd0b 100644 --- a/vllm_ascend/attention/context_parallel/mla_cp.py +++ b/vllm_ascend/attention/context_parallel/mla_cp.py @@ -69,6 +69,9 @@ class AscendMlaCPMetadataBuilder(AscendMLAMetadataBuilder): self.decode_threshold, dtype=torch.uint8, device=device) + self.block_size = (self.block_size * + self.cp_virtual_block_size) // np.gcd( + self.block_size, self.cp_virtual_block_size) def build( self,