[PD] Optimize time out logic and add env var doc for mooncake (#6761)

Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
This commit is contained in:
shangmingc
2025-05-30 15:45:02 +08:00
committed by GitHub
parent 62cac2c43a
commit 6cb00c6398
2 changed files with 29 additions and 10 deletions

View File

@@ -677,14 +677,15 @@ class MooncakeKVSender(BaseKVSender):
self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
self.aux_index = None
self.bootstrap_server_url = bootstrap_addr
self.init_time = time.time()
self.conclude_state = None
self.init_time = None
# inner state
self.curr_idx = 0
def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
self.num_kv_indices = num_kv_indices
self.aux_index = aux_index
self.init_time = time.time()
def send(
self,
@@ -713,15 +714,16 @@ class MooncakeKVSender(BaseKVSender):
if status in (KVPoll.Success, KVPoll.Failed):
self.conclude_state = status
elif status == KVPoll.Bootstrapping:
now = time.time()
elapsed = now - self.init_time
if elapsed >= self.kv_mgr.bootstrap_time_out:
self.kv_mgr.record_failure(
self.bootstrap_room,
f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
)
self.conclude_state = KVPoll.Failed
return KVPoll.Failed
if self.init_time is not None:
now = time.time()
elapsed = now - self.init_time
if elapsed >= self.kv_mgr.bootstrap_time_out:
self.kv_mgr.record_failure(
self.bootstrap_room,
f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
)
self.conclude_state = KVPoll.Failed
return KVPoll.Failed
return status
else: