From 89733111fa85edbc939fc9a43d6cb338c7fd0cea Mon Sep 17 00:00:00 2001 From: Li Wang Date: Wed, 10 Dec 2025 09:24:19 +0800 Subject: [PATCH] [Nightly] Optimize nightly online test logger info (#4798) ### What this PR does / why we need it? This patch makes some small optimizations to the nightly CI: 1. Throttle how often the wait loop prints logs while the service starts up, so that useful information can be found more quickly. 2. Shorten the timeout for waiting for the server. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: wangli --- tests/e2e/conftest.py | 31 +++++++++++-------- .../e2e/nightly/multi_node/test_multi_node.py | 2 +- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 8665aa91..9d0e709a 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -224,53 +224,58 @@ class RemoteOpenAIServer: # Then wait for all api_server nodes self._wait_for_multiple_servers(targets=targets, timeout=timeout) - def _wait_for_multiple_servers(self, targets, timeout: float): + def _wait_for_multiple_servers(self, + targets, + timeout: float, + log_interval: float = 30.0): """ targets: List[(node_ip, url)] + log_interval """ start = time.time() client = requests - # track readiness ready = {node_ip: False for node_ip, _ in targets} - # polling loop + last_log_time = 0.0 + while True: + now = time.time() all_ready = True + should_log = (now - last_log_time) >= log_interval for node_ip, url in targets: if ready[node_ip]: - continue # already ready + continue try: resp = client.get(url) if resp.status_code == 200: ready[node_ip] = True logger.info(f"[READY] Node {node_ip} is ready.") - else: - all_ready = False - logger.info(f"[WAIT] {url}: HTTP {resp.status_code}") except RequestException: all_ready = False - logger.info(f"[WAIT] {url}: connection failed") + if should_log: + logger.info(f"[WAIT] {url}: connection failed") - 
# underlying process died? + # check unexpected exit result = self._poll() if result is not None and result != 0: raise RuntimeError( f"Server at {node_ip} exited unexpectedly." ) from None - # if all nodes ready, exit + if should_log: + last_log_time = now + if all_ready: break - # check timeout - if time.time() - start > timeout: + if now - start > timeout: not_ready_nodes = [n for n, ok in ready.items() if not ok] self._terminate_server() raise RuntimeError( - f"Timeout: these nodes did not become ready: {not_ready_nodes}" + f"Timeout: these nodes did not become ready: {not_ready_nodes} in time: {timeout}s" ) from None time.sleep(5) diff --git a/tests/e2e/nightly/multi_node/test_multi_node.py b/tests/e2e/nightly/multi_node/test_multi_node.py index 66745a97..054ea31e 100644 --- a/tests/e2e/nightly/multi_node/test_multi_node.py +++ b/tests/e2e/nightly/multi_node/test_multi_node.py @@ -114,7 +114,7 @@ async def test_multi_node() -> None: proxy_port=proxy_port, disaggregated_prefill=disaggregated_prefill, nodes_info=nodes_info, - max_wait_seconds=2000, + max_wait_seconds=1200, ) as remote_server: if config.is_master: port = proxy_port if disaggregated_prefill else server_port