[PD] Abort unbootstrapped prefill requests through timeout (#6685)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
This commit is contained in:
@@ -168,6 +168,9 @@ class MooncakeKVManager(BaseKVManager):
|
|||||||
min(max(1, cpu_count // 8), 8),
|
min(max(1, cpu_count // 8), 8),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
self.bootstrap_time_out = get_int_env_var(
|
||||||
|
"SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 30
|
||||||
|
)
|
||||||
elif self.disaggregation_mode == DisaggregationMode.DECODE:
|
elif self.disaggregation_mode == DisaggregationMode.DECODE:
|
||||||
self.heartbeat_failures = {}
|
self.heartbeat_failures = {}
|
||||||
self.session_pool = defaultdict(requests.Session)
|
self.session_pool = defaultdict(requests.Session)
|
||||||
@@ -351,7 +354,7 @@ class MooncakeKVManager(BaseKVManager):
|
|||||||
if req.mooncake_session_id in self.failed_sessions:
|
if req.mooncake_session_id in self.failed_sessions:
|
||||||
self.record_failure(
|
self.record_failure(
|
||||||
kv_chunk.room,
|
kv_chunk.room,
|
||||||
f"Decode instance could be dead, {req.mooncake_session_id} failed due to multiple errors",
|
f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
|
||||||
)
|
)
|
||||||
self.update_status(kv_chunk.room, KVPoll.Failed)
|
self.update_status(kv_chunk.room, KVPoll.Failed)
|
||||||
self.sync_status_to_decode_endpoint(
|
self.sync_status_to_decode_endpoint(
|
||||||
@@ -447,7 +450,8 @@ class MooncakeKVManager(BaseKVManager):
|
|||||||
kv_chunk.room not in self.request_status
|
kv_chunk.room not in self.request_status
|
||||||
or self.check_status(kv_chunk.room) == KVPoll.Success
|
or self.check_status(kv_chunk.room) == KVPoll.Success
|
||||||
):
|
):
|
||||||
self.transfer_infos.pop(kv_chunk.room)
|
if kv_chunk.room in self.transfer_infos:
|
||||||
|
self.transfer_infos.pop(kv_chunk.room)
|
||||||
|
|
||||||
except queue.Empty:
|
except queue.Empty:
|
||||||
continue
|
continue
|
||||||
@@ -630,7 +634,8 @@ class MooncakeKVManager(BaseKVManager):
|
|||||||
possible_affected_rooms = self.addr_to_rooms_tracker.get(
|
possible_affected_rooms = self.addr_to_rooms_tracker.get(
|
||||||
failed_bootstrap_addr, []
|
failed_bootstrap_addr, []
|
||||||
)
|
)
|
||||||
del self.addr_to_rooms_tracker[failed_bootstrap_addr]
|
if failed_bootstrap_addr in self.addr_to_rooms_tracker:
|
||||||
|
del self.addr_to_rooms_tracker[failed_bootstrap_addr]
|
||||||
|
|
||||||
# Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
|
# Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
|
||||||
affected_rooms = []
|
affected_rooms = []
|
||||||
@@ -660,6 +665,7 @@ class MooncakeKVSender(BaseKVSender):
|
|||||||
self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
|
self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
|
||||||
self.aux_index = None
|
self.aux_index = None
|
||||||
self.bootstrap_server_url = bootstrap_addr
|
self.bootstrap_server_url = bootstrap_addr
|
||||||
|
self.init_time = time.time()
|
||||||
self.conclude_state = None
|
self.conclude_state = None
|
||||||
# inner state
|
# inner state
|
||||||
self.curr_idx = 0
|
self.curr_idx = 0
|
||||||
@@ -694,13 +700,24 @@ class MooncakeKVSender(BaseKVSender):
|
|||||||
status = self.kv_mgr.check_status(self.bootstrap_room)
|
status = self.kv_mgr.check_status(self.bootstrap_room)
|
||||||
if status in (KVPoll.Success, KVPoll.Failed):
|
if status in (KVPoll.Success, KVPoll.Failed):
|
||||||
self.conclude_state = status
|
self.conclude_state = status
|
||||||
|
elif status == KVPoll.Bootstrapping:
|
||||||
|
now = time.time()
|
||||||
|
elapsed = now - self.init_time
|
||||||
|
if elapsed >= self.kv_mgr.bootstrap_time_out:
|
||||||
|
self.kv_mgr.record_failure(
|
||||||
|
self.bootstrap_room,
|
||||||
|
f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
|
||||||
|
)
|
||||||
|
self.conclude_state = KVPoll.Failed
|
||||||
|
return KVPoll.Failed
|
||||||
|
|
||||||
return status
|
return status
|
||||||
else:
|
else:
|
||||||
return self.conclude_state
|
return self.conclude_state
|
||||||
|
|
||||||
def clear(self) -> None:
|
def clear(self) -> None:
|
||||||
self.kv_mgr.request_status.pop(self.bootstrap_room)
|
if self.bootstrap_room in self.kv_mgr.request_status:
|
||||||
|
self.kv_mgr.request_status.pop(self.bootstrap_room)
|
||||||
|
|
||||||
def failure_exception(self):
|
def failure_exception(self):
|
||||||
self.clear()
|
self.clear()
|
||||||
@@ -956,7 +973,8 @@ class MooncakeKVReceiver(BaseKVReceiver):
|
|||||||
return self.conclude_state
|
return self.conclude_state
|
||||||
|
|
||||||
def clear(self) -> None:
|
def clear(self) -> None:
|
||||||
self.kv_mgr.request_status.pop(self.bootstrap_room)
|
if self.bootstrap_room in self.kv_mgr.request_status:
|
||||||
|
self.kv_mgr.request_status.pop(self.bootstrap_room)
|
||||||
|
|
||||||
def failure_exception(self):
|
def failure_exception(self):
|
||||||
self.clear()
|
self.clear()
|
||||||
|
|||||||
Reference in New Issue
Block a user