[Nightly] Optimize nightly online test logger info (#4798)

### What this PR does / why we need it?
This patch makes a few small optimizations to the nightly CI:

1. Throttle how often the service-readiness polling loop prints
"connection failed" logs during startup (at most once per `log_interval`,
30 s by default), so that useful information can be found more quickly.
2. Shorten the timeout used when waiting for the server.

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-12-10 09:24:19 +08:00
committed by GitHub
parent 835b4c8f1d
commit 89733111fa
2 changed files with 19 additions and 14 deletions

View File

@@ -224,53 +224,58 @@ class RemoteOpenAIServer:
# Then wait for all api_server nodes
self._wait_for_multiple_servers(targets=targets, timeout=timeout)
def _wait_for_multiple_servers(self, targets, timeout: float):
def _wait_for_multiple_servers(self,
targets,
timeout: float,
log_interval: float = 30.0):
"""
targets: List[(node_ip, url)]
log_interval
"""
start = time.time()
client = requests
# track readiness
ready = {node_ip: False for node_ip, _ in targets}
# polling loop
last_log_time = 0.0
while True:
now = time.time()
all_ready = True
should_log = (now - last_log_time) >= log_interval
for node_ip, url in targets:
if ready[node_ip]:
continue # already ready
continue
try:
resp = client.get(url)
if resp.status_code == 200:
ready[node_ip] = True
logger.info(f"[READY] Node {node_ip} is ready.")
else:
all_ready = False
logger.info(f"[WAIT] {url}: HTTP {resp.status_code}")
except RequestException:
all_ready = False
logger.info(f"[WAIT] {url}: connection failed")
if should_log:
logger.info(f"[WAIT] {url}: connection failed")
# underlying process died?
# check unexpected exit
result = self._poll()
if result is not None and result != 0:
raise RuntimeError(
f"Server at {node_ip} exited unexpectedly."
) from None
# if all nodes ready, exit
if should_log:
last_log_time = now
if all_ready:
break
# check timeout
if time.time() - start > timeout:
if now - start > timeout:
not_ready_nodes = [n for n, ok in ready.items() if not ok]
self._terminate_server()
raise RuntimeError(
f"Timeout: these nodes did not become ready: {not_ready_nodes}"
f"Timeout: these nodes did not become ready: {not_ready_nodes} in time: {timeout}s"
) from None
time.sleep(5)