From eed9e366a79403eeac5ca0d7c759239c3c0a006d Mon Sep 17 00:00:00 2001 From: liziyu <56102866+liziyu179@users.noreply.github.com> Date: Tue, 13 Jan 2026 17:30:33 +0800 Subject: [PATCH] [Bugfix][P/D] fix layerwise connector for decoder tp size > num kv heads (#5846) ### What this PR does / why we need it? Fix the layerwise connector for the case where the decoder TP size is greater than the number of KV heads. In this case, the prefiller should push the KV cache to all decoder NPUs. - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2f4e6548efec402b913ffddc8726230d9311948d Signed-off-by: liziyu --- vllm_ascend/distributed/mooncake_layerwise_connector.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/distributed/mooncake_layerwise_connector.py b/vllm_ascend/distributed/mooncake_layerwise_connector.py index e7e3219a..f9e15d5c 100644 --- a/vllm_ascend/distributed/mooncake_layerwise_connector.py +++ b/vllm_ascend/distributed/mooncake_layerwise_connector.py @@ -1078,10 +1078,14 @@ class MooncakeLayerwiseConnectorWorker: ): # enable decode prefix cache if self.use_mla or self.use_sparse: - num_kv_head = self._decode_tp_size + num_need_send = self._decode_tp_size else: num_kv_head = self.vllm_config.model_config.hf_config.num_key_value_heads - num_replica_groups = self.tp_size // num_kv_head if self.tp_size >= num_kv_head else 1 + if self.tp_size <= num_kv_head: + num_need_send = self.tp_size + else: + num_need_send = self._decode_tp_size if self._decode_tp_size >= num_kv_head else num_kv_head + num_replica_groups = self.tp_size // num_need_send if self.tp_size >= num_need_send else 1 replica_group_idx = self.tp_rank % num_replica_groups req_ids = sorted(list(connector_metadata.requests.keys())) selected_req_ids = [