[Bugfix][PD] Set conclude state before clear when failure happens (#7362)

Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
This commit is contained in:
Shangming Cai
2025-06-20 02:26:53 +08:00
committed by GitHub
parent 4f838c09cd
commit f88e70853e

View File

@@ -742,12 +742,12 @@ class MooncakeKVSender(BaseKVSender):
self.kv_mgr.request_status.pop(self.bootstrap_room)
     def failure_exception(self):
-        self.clear()
         # Explicitly set the status to failure since this request has failed in another rank
         if self.conclude_state is None:
             self.conclude_state = KVPoll.Failed
+        self.clear()
         with self.kv_mgr.failure_lock:
             failure_reason = self.kv_mgr.failure_records.pop(
                 self.bootstrap_room, "Failed due to an unknown reason from another rank"
@@ -1003,12 +1003,12 @@ class MooncakeKVReceiver(BaseKVReceiver):
self.kv_mgr.request_status.pop(self.bootstrap_room)
     def failure_exception(self):
-        self.clear()
         # Explicitly set the status to failure since this request has failed in another rank
         if self.conclude_state is None:
             self.conclude_state = KVPoll.Failed
+        self.clear()
         with self.kv_mgr.failure_lock:
             failure_reason = self.kv_mgr.failure_records.pop(
                 self.bootstrap_room, "Failed due to an unknown reason from another rank"