[P/D] [Bugfix] Fix Mooncake layerwise connector dying when update_decoder_info fails (#7514)

### What this PR does / why we need it?
Fix the Mooncake layerwise connector dying when `update_decoder_info` fails.
When node D is dead, a failure of `update_decoder_info` on node P should not
cause node P to die as well.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Tested by CI.

- vLLM version: v0.17.0
- vLLM main:
8b6325758c

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
liziyu
2026-03-24 15:49:46 +08:00
committed by GitHub
parent 67aad1fce8
commit 73cadecfb4

View File

@@ -1626,7 +1626,14 @@ class MooncakeLayerwiseConnectorWorker:
for req_id, req_meta in connector_metadata.requests.items():
if len(req_meta.local_block_ids[layer_group_idx]) == 0:
continue
req_meta_update = self.update_decoder_info(req_id, req_meta)
try:
req_meta_update = self.update_decoder_info(req_id, req_meta)
except Exception as e:
logger.warning(
f"MooncakeLayerwiseConnector transfer fail for req_id {req_id} in layer_idx "
f"{self.current_layer}, update_decoder_info with error: {e}"
)
continue
logger.debug(f"Add request {req_id} to kv send layer thread. {req_meta_update=}")
layer_send_task.send_request[req_id] = req_meta_update
@@ -1681,6 +1688,7 @@ class MooncakeLayerwiseConnectorWorker:
f"from {req_meta.remote_host}:{req_meta.remote_port}"
f"fail with error: {e}"
)
raise e
assert req_meta.remote_engine_id != self.engine_id, (
f"Conflict engine id {req_meta.remote_engine_id} with local engine id {self.local_engine_id}."
)