[Nightly] Optimize nightly online test logger info (#4798)
### What this PR does / why we need it?
This patch do some tiny optimization for nightly ci:
1. Polling the frequency with which the service prints logs when it
starts up in order to obtain useful information more quickly.
2. Shorten the timeout for waiting server
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -224,53 +224,58 @@ class RemoteOpenAIServer:
|
||||
# Then wait for all api_server nodes
|
||||
self._wait_for_multiple_servers(targets=targets, timeout=timeout)
|
||||
|
||||
def _wait_for_multiple_servers(self, targets, timeout: float):
|
||||
def _wait_for_multiple_servers(self,
|
||||
targets,
|
||||
timeout: float,
|
||||
log_interval: float = 30.0):
|
||||
"""
|
||||
targets: List[(node_ip, url)]
|
||||
log_interval
|
||||
"""
|
||||
start = time.time()
|
||||
client = requests
|
||||
|
||||
# track readiness
|
||||
ready = {node_ip: False for node_ip, _ in targets}
|
||||
|
||||
# polling loop
|
||||
last_log_time = 0.0
|
||||
|
||||
while True:
|
||||
now = time.time()
|
||||
all_ready = True
|
||||
should_log = (now - last_log_time) >= log_interval
|
||||
|
||||
for node_ip, url in targets:
|
||||
if ready[node_ip]:
|
||||
continue # already ready
|
||||
continue
|
||||
|
||||
try:
|
||||
resp = client.get(url)
|
||||
if resp.status_code == 200:
|
||||
ready[node_ip] = True
|
||||
logger.info(f"[READY] Node {node_ip} is ready.")
|
||||
else:
|
||||
all_ready = False
|
||||
logger.info(f"[WAIT] {url}: HTTP {resp.status_code}")
|
||||
except RequestException:
|
||||
all_ready = False
|
||||
logger.info(f"[WAIT] {url}: connection failed")
|
||||
if should_log:
|
||||
logger.info(f"[WAIT] {url}: connection failed")
|
||||
|
||||
# underlying process died?
|
||||
# check unexpected exit
|
||||
result = self._poll()
|
||||
if result is not None and result != 0:
|
||||
raise RuntimeError(
|
||||
f"Server at {node_ip} exited unexpectedly."
|
||||
) from None
|
||||
|
||||
# if all nodes ready, exit
|
||||
if should_log:
|
||||
last_log_time = now
|
||||
|
||||
if all_ready:
|
||||
break
|
||||
|
||||
# check timeout
|
||||
if time.time() - start > timeout:
|
||||
if now - start > timeout:
|
||||
not_ready_nodes = [n for n, ok in ready.items() if not ok]
|
||||
self._terminate_server()
|
||||
raise RuntimeError(
|
||||
f"Timeout: these nodes did not become ready: {not_ready_nodes}"
|
||||
f"Timeout: these nodes did not become ready: {not_ready_nodes} in time: {timeout}s"
|
||||
) from None
|
||||
|
||||
time.sleep(5)
|
||||
|
||||
Reference in New Issue
Block a user