[P/D] [Bugfix] fix mooncake layerconnector dead when update_decoder_info fail (#7514)
### What this PR does / why we need it?
Fix the Mooncake layerwise connector dying when `update_decoder_info` fails.
When the decode node (D) is dead, a failure of `update_decoder_info` on the
prefill node (P) should not cause node P to die as well.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
Tested by CI.
- vLLM version: v0.17.0
- vLLM main:
8b6325758c
---------
Signed-off-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
@@ -1626,7 +1626,14 @@ class MooncakeLayerwiseConnectorWorker:
|
|||||||
for req_id, req_meta in connector_metadata.requests.items():
|
for req_id, req_meta in connector_metadata.requests.items():
|
||||||
if len(req_meta.local_block_ids[layer_group_idx]) == 0:
|
if len(req_meta.local_block_ids[layer_group_idx]) == 0:
|
||||||
continue
|
continue
|
||||||
req_meta_update = self.update_decoder_info(req_id, req_meta)
|
try:
|
||||||
|
req_meta_update = self.update_decoder_info(req_id, req_meta)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"MooncakeLayerwiseConnector transfer fail for req_id {req_id} in layer_idx "
|
||||||
|
f"{self.current_layer}, update_decoder_info with error: {e}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
logger.debug(f"Add request {req_id} to kv send layer thread. {req_meta_update=}")
|
logger.debug(f"Add request {req_id} to kv send layer thread. {req_meta_update=}")
|
||||||
layer_send_task.send_request[req_id] = req_meta_update
|
layer_send_task.send_request[req_id] = req_meta_update
|
||||||
|
|
||||||
@@ -1681,6 +1688,7 @@ class MooncakeLayerwiseConnectorWorker:
|
|||||||
f"from {req_meta.remote_host}:{req_meta.remote_port}"
|
f"from {req_meta.remote_host}:{req_meta.remote_port}"
|
||||||
f"fail with error: {e}"
|
f"fail with error: {e}"
|
||||||
)
|
)
|
||||||
|
raise e
|
||||||
assert req_meta.remote_engine_id != self.engine_id, (
|
assert req_meta.remote_engine_id != self.engine_id, (
|
||||||
f"Conflict engine id {req_meta.remote_engine_id} with local engine id {self.local_engine_id}."
|
f"Conflict engine id {req_meta.remote_engine_id} with local engine id {self.local_engine_id}."
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user