[Bugfix][PD] Set conclude state before clear when failure happens (#7362)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
This commit is contained in:
@@ -742,12 +742,12 @@ class MooncakeKVSender(BaseKVSender):
|
|||||||
self.kv_mgr.request_status.pop(self.bootstrap_room)
|
self.kv_mgr.request_status.pop(self.bootstrap_room)
|
||||||
|
|
||||||
def failure_exception(self):
|
def failure_exception(self):
|
||||||
self.clear()
|
|
||||||
|
|
||||||
# Explicitly set the status to failure since this request has failed in another rank
|
# Explicitly set the status to failure since this request has failed in another rank
|
||||||
if self.conclude_state is None:
|
if self.conclude_state is None:
|
||||||
self.conclude_state = KVPoll.Failed
|
self.conclude_state = KVPoll.Failed
|
||||||
|
|
||||||
|
self.clear()
|
||||||
|
|
||||||
with self.kv_mgr.failure_lock:
|
with self.kv_mgr.failure_lock:
|
||||||
failure_reason = self.kv_mgr.failure_records.pop(
|
failure_reason = self.kv_mgr.failure_records.pop(
|
||||||
self.bootstrap_room, "Failed due to an unknown reason from another rank"
|
self.bootstrap_room, "Failed due to an unknown reason from another rank"
|
||||||
@@ -1003,12 +1003,12 @@ class MooncakeKVReceiver(BaseKVReceiver):
|
|||||||
self.kv_mgr.request_status.pop(self.bootstrap_room)
|
self.kv_mgr.request_status.pop(self.bootstrap_room)
|
||||||
|
|
||||||
def failure_exception(self):
|
def failure_exception(self):
|
||||||
self.clear()
|
|
||||||
|
|
||||||
# Explicitly set the status to failure since this request has failed in another rank
|
# Explicitly set the status to failure since this request has failed in another rank
|
||||||
if self.conclude_state is None:
|
if self.conclude_state is None:
|
||||||
self.conclude_state = KVPoll.Failed
|
self.conclude_state = KVPoll.Failed
|
||||||
|
|
||||||
|
self.clear()
|
||||||
|
|
||||||
with self.kv_mgr.failure_lock:
|
with self.kv_mgr.failure_lock:
|
||||||
failure_reason = self.kv_mgr.failure_records.pop(
|
failure_reason = self.kv_mgr.failure_records.pop(
|
||||||
self.bootstrap_room, "Failed due to an unknown reason from another rank"
|
self.bootstrap_room, "Failed due to an unknown reason from another rank"
|
||||||
|
|||||||
Reference in New Issue
Block a user