[Nightly] Optimize nightly online test logger info (#4798)
### What this PR does / why we need it?
This patch makes some small optimizations to the nightly CI:
1. Throttle how often the readiness-polling loop prints "[WAIT]" logs while
the service starts up (at most once per `log_interval`, 30 s by default), so that useful information is easier to find.
2. Shorten the timeout for waiting for the server to become ready (`max_wait_seconds`: 2000 -> 1200).
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -224,53 +224,58 @@ class RemoteOpenAIServer:
|
|||||||
# Then wait for all api_server nodes
|
# Then wait for all api_server nodes
|
||||||
self._wait_for_multiple_servers(targets=targets, timeout=timeout)
|
self._wait_for_multiple_servers(targets=targets, timeout=timeout)
|
||||||
|
|
||||||
def _wait_for_multiple_servers(self, targets, timeout: float):
|
def _wait_for_multiple_servers(self,
|
||||||
|
targets,
|
||||||
|
timeout: float,
|
||||||
|
log_interval: float = 30.0):
|
||||||
"""
|
"""
|
||||||
targets: List[(node_ip, url)]
|
targets: List[(node_ip, url)]
|
||||||
|
log_interval
|
||||||
"""
|
"""
|
||||||
start = time.time()
|
start = time.time()
|
||||||
client = requests
|
client = requests
|
||||||
|
|
||||||
# track readiness
|
|
||||||
ready = {node_ip: False for node_ip, _ in targets}
|
ready = {node_ip: False for node_ip, _ in targets}
|
||||||
|
|
||||||
# polling loop
|
last_log_time = 0.0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
now = time.time()
|
||||||
all_ready = True
|
all_ready = True
|
||||||
|
should_log = (now - last_log_time) >= log_interval
|
||||||
|
|
||||||
for node_ip, url in targets:
|
for node_ip, url in targets:
|
||||||
if ready[node_ip]:
|
if ready[node_ip]:
|
||||||
continue # already ready
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
resp = client.get(url)
|
resp = client.get(url)
|
||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
ready[node_ip] = True
|
ready[node_ip] = True
|
||||||
logger.info(f"[READY] Node {node_ip} is ready.")
|
logger.info(f"[READY] Node {node_ip} is ready.")
|
||||||
else:
|
|
||||||
all_ready = False
|
|
||||||
logger.info(f"[WAIT] {url}: HTTP {resp.status_code}")
|
|
||||||
except RequestException:
|
except RequestException:
|
||||||
all_ready = False
|
all_ready = False
|
||||||
logger.info(f"[WAIT] {url}: connection failed")
|
if should_log:
|
||||||
|
logger.info(f"[WAIT] {url}: connection failed")
|
||||||
|
|
||||||
# underlying process died?
|
# check unexpected exit
|
||||||
result = self._poll()
|
result = self._poll()
|
||||||
if result is not None and result != 0:
|
if result is not None and result != 0:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Server at {node_ip} exited unexpectedly."
|
f"Server at {node_ip} exited unexpectedly."
|
||||||
) from None
|
) from None
|
||||||
|
|
||||||
# if all nodes ready, exit
|
if should_log:
|
||||||
|
last_log_time = now
|
||||||
|
|
||||||
if all_ready:
|
if all_ready:
|
||||||
break
|
break
|
||||||
|
|
||||||
# check timeout
|
if now - start > timeout:
|
||||||
if time.time() - start > timeout:
|
|
||||||
not_ready_nodes = [n for n, ok in ready.items() if not ok]
|
not_ready_nodes = [n for n, ok in ready.items() if not ok]
|
||||||
self._terminate_server()
|
self._terminate_server()
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Timeout: these nodes did not become ready: {not_ready_nodes}"
|
f"Timeout: these nodes did not become ready: {not_ready_nodes} in time: {timeout}s"
|
||||||
) from None
|
) from None
|
||||||
|
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ async def test_multi_node() -> None:
|
|||||||
proxy_port=proxy_port,
|
proxy_port=proxy_port,
|
||||||
disaggregated_prefill=disaggregated_prefill,
|
disaggregated_prefill=disaggregated_prefill,
|
||||||
nodes_info=nodes_info,
|
nodes_info=nodes_info,
|
||||||
max_wait_seconds=2000,
|
max_wait_seconds=1200,
|
||||||
) as remote_server:
|
) as remote_server:
|
||||||
if config.is_master:
|
if config.is_master:
|
||||||
port = proxy_port if disaggregated_prefill else server_port
|
port = proxy_port if disaggregated_prefill else server_port
|
||||||
|
|||||||
Reference in New Issue
Block a user