From c3b1d409a9c589a478b7080a4ae4faf2d1c6c2b3 Mon Sep 17 00:00:00 2001 From: liziyu <56102866+liziyu179@users.noreply.github.com> Date: Thu, 23 Apr 2026 09:16:37 +0800 Subject: [PATCH] [BugFix] [P/D] [CherryPick] 8540 In scenarios where the TP sizes are not equal, the KV cache at the MTP layer is not handled. (#8541) ### What this PR does / why we need it? Fix the issue where the Mooncake connector does not handle the MTP layer KV cache when TP is unbalanced. backport: #8540 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? by the nightly CI Signed-off-by: liziyu --- vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py index 4c692e33..ed48d0b5 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_connector.py @@ -587,6 +587,7 @@ class KVCacheRecvingThread(threading.Thread): block_size = self.vllm_config.cache_config.block_size num_kv_head = max(self.model_config.hf_text_config.num_key_value_heads // self.tp_size, 1) layers = self.model_config.hf_text_config.num_hidden_layers + layers = len(self.kv_caches) flat_block_ids = [item for sublist in block_ids for item in sublist] block_ids_tensor = torch.tensor(flat_block_ids, dtype=torch.int64, device=device)