From 73cadecfb47a54f36bed84313754abb444b80010 Mon Sep 17 00:00:00 2001 From: liziyu <56102866+liziyu179@users.noreply.github.com> Date: Tue, 24 Mar 2026 15:49:46 +0800 Subject: [PATCH] [P/D] [Bugfix] fix mooncake layerconnector dead when update_decoder_info fail (#7514) ### What this PR does / why we need it? Fix the Mooncake layerwise connector dying when `update_decoder_info` fails. When node D is dead, a failure of `update_decoder_info` on node P must not cause node P to die as well; the failure is now logged as a warning and the affected request is skipped for that layer. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? By CI. - vLLM version: v0.17.0 - vLLM main: https://github.com/vllm-project/vllm/commit/8b6325758cce5f9c36d38f2462edbd368b97a07c --------- Signed-off-by: liziyu --- .../kv_transfer/kv_p2p/mooncake_layerwise_connector.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py index b800b9ed..f7206365 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_p2p/mooncake_layerwise_connector.py @@ -1626,7 +1626,14 @@ class MooncakeLayerwiseConnectorWorker: for req_id, req_meta in connector_metadata.requests.items(): if len(req_meta.local_block_ids[layer_group_idx]) == 0: continue - req_meta_update = self.update_decoder_info(req_id, req_meta) + try: + req_meta_update = self.update_decoder_info(req_id, req_meta) + except Exception as e: + logger.warning( + f"MooncakeLayerwiseConnector transfer fail for req_id {req_id} in layer_idx " + f"{self.current_layer}, update_decoder_info with error: {e}" + ) + continue logger.debug(f"Add request {req_id} to kv send layer thread. 
{req_meta_update=}") layer_send_task.send_request[req_id] = req_meta_update @@ -1681,6 +1688,7 @@ class MooncakeLayerwiseConnectorWorker: f"from {req_meta.remote_host}:{req_meta.remote_port}" f"fail with error: {e}" ) + raise e assert req_meta.remote_engine_id != self.engine_id, ( f"Conflict engine id {req_meta.remote_engine_id} with local engine id {self.local_engine_id}." )