[Nightly] Optimize nightly CI (#4509)

### What this PR does / why we need it?
1. Optimize multi-node waiting logic
2. Remove the `tee` pipeline for logs, which can cause the test run to hang

### How was this patch tested?


- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.12.0

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-12-04 22:31:07 +08:00
committed by GitHub
parent fb15fec662
commit 283bc5c7ba
8 changed files with 90 additions and 105 deletions

View File

@@ -84,8 +84,6 @@ jobs:
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
pip install jinja2-cli pip install jinja2-cli
#apt-get update -y && apt-get install -y git curl
- name: Install kubectl - name: Install kubectl
run: | run: |
# Install kubectl # Install kubectl
@@ -112,6 +110,8 @@ jobs:
run: | run: |
# prepare for lws entrypoint scripts # prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
# clear log directory
rm -fr $RESULT_FILE
- name: Clear resources - name: Clear resources
run: | run: |
@@ -263,5 +263,5 @@ jobs:
- name: Post process - name: Post process
if: always() if: always()
run: | run: |
kubectl get pods -n $NAMESPACE kubectl get pods -n $NAMESPACE --ignore-not-found=true
kubectl delete -f ./lws.yaml kubectl delete -f ./lws.yaml --ignore-not-found=true || true

View File

@@ -61,9 +61,6 @@ jobs:
- name: multi-node-qwenw8a8-2node - name: multi-node-qwenw8a8-2node
config_file_path: Qwen3-235B-W8A8.yaml config_file_path: Qwen3-235B-W8A8.yaml
size: 2 size: 2
- name: multi-node-glm-2node
config_file_path: GLM-4_5.yaml
size: 2
- name: multi-node-dpsk3.2-exp-2node - name: multi-node-dpsk3.2-exp-2node
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
size: 2 size: 2
@@ -134,9 +131,6 @@ jobs:
- name: deepseek3_2-exp-w8a8 - name: deepseek3_2-exp-w8a8
os: linux-aarch64-a3-16 os: linux-aarch64-a3-16
tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
- name: glm-4-5
os: linux-aarch64-a3-16
tests: tests/e2e/nightly/models/test_glm4_5.py
uses: ./.github/workflows/_e2e_nightly_single_node.yaml uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with: with:
vllm: v0.12.0 vllm: v0.12.0

View File

@@ -78,6 +78,7 @@ if [ -n "$LOCAL_DEVICE_IDS" ]; then
fi fi
if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
timeout 180s \
GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \ GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
--nproc_per_node 1 \ --nproc_per_node 1 \
--nnodes ${NNODES} \ --nnodes ${NNODES} \

View File

@@ -20,6 +20,7 @@
import contextlib import contextlib
import gc import gc
import json import json
import logging
import os import os
import shlex import shlex
import subprocess import subprocess
@@ -35,6 +36,7 @@ import requests
import torch import torch
from modelscope import snapshot_download # type: ignore[import-untyped] from modelscope import snapshot_download # type: ignore[import-untyped]
from PIL import Image from PIL import Image
from requests.exceptions import RequestException
from torch import nn from torch import nn
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
BatchEncoding, BatchFeature) BatchEncoding, BatchFeature)
@@ -70,6 +72,7 @@ _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
PromptImageInput = _PromptMultiModalInput[Image.Image] PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray] PromptVideoInput = _PromptMultiModalInput[np.ndarray]
logger = logging.getLogger(__name__)
_TEST_DIR = os.path.dirname(__file__) _TEST_DIR = os.path.dirname(__file__)
@@ -161,22 +164,17 @@ class RemoteOpenAIServer:
max_wait_seconds = max_wait_seconds or 1800 max_wait_seconds = max_wait_seconds or 1800
if self.disaggregated_prefill: if self.disaggregated_prefill:
assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided" assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
self._wait_for_server_pd(proxy_port=proxy_port, self._wait_for_server_pd(timeout=max_wait_seconds)
timeout=max_wait_seconds)
else: else:
self._wait_for_server(url=self.url_for("health"), self._wait_for_multiple_servers(
[(self.host, self.url_for("health"))],
timeout=max_wait_seconds) timeout=max_wait_seconds)
def __enter__(self): def __enter__(self):
return self return self
def __exit__(self, exc_type, exc_value, traceback): def __exit__(self, exc_type, exc_value, traceback):
self.proc.terminate() self._terminate_server()
try:
self.proc.wait(8)
except subprocess.TimeoutExpired:
# force kill if needed
self.proc.kill()
def _poll(self) -> Optional[int]: def _poll(self) -> Optional[int]:
"""Subclasses override this method to customize process polling""" """Subclasses override this method to customize process polling"""
@@ -201,48 +199,95 @@ class RemoteOpenAIServer:
finally: finally:
if isinstance(client, httpx.Client): if isinstance(client, httpx.Client):
client.close() client.close()
self._terminate_server()
def _wait_for_server_pd(self, proxy_port: int, timeout: float): def _wait_for_server_pd(self, timeout: float):
# Wait for all api_server nodes ready # Wait for all api_server nodes ready
assert self.nodes_info is not None, "cluster info must be provided" assert self.nodes_info is not None, "cluster info must be provided"
for node_info in self.nodes_info: proxy_port = self.proxy_port
if node_info.headless:
continue
url_health = f"http://{node_info.ip}:{node_info.server_port}/health" def url_health(ip: str, port: int) -> str:
self._wait_for_server(url=url_health, timeout=timeout) return f"http://{ip}:{port}/health"
targets = [(node_info.ip,
url_health(node_info.ip, node_info.server_port))
for node_info in self.nodes_info if not node_info.headless]
# Wait for proxy ready # Wait for proxy ready
master_node = self.nodes_info[0] master_node = self.nodes_info[0]
url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck" url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"
self._wait_for_server(url=url_proxy, timeout=timeout)
def _wait_for_server(self, *, url: str, timeout: float): # Wait for master node proxy first
# run health check self._wait_for_multiple_servers([(master_node.ip, url_proxy)],
timeout=timeout)
# Then wait for all api_server nodes
self._wait_for_multiple_servers(targets=targets, timeout=timeout)
def _wait_for_multiple_servers(self, targets, timeout: float):
"""
targets: List[(node_ip, url)]
"""
start = time.time() start = time.time()
client = requests client = requests
# track readiness
ready = {node_ip: False for node_ip, _ in targets}
# polling loop
while True: while True:
all_ready = True
for node_ip, url in targets:
if ready[node_ip]:
continue # already ready
try: try:
if client.get(url).status_code == 200: resp = client.get(url)
break if resp.status_code == 200:
except Exception: ready[node_ip] = True
# this exception can only be raised by requests.get, logger.info(f"[READY] Node {node_ip} is ready.")
# which means the server is not ready yet. else:
# the stack trace is not useful, so we suppress it all_ready = False
# by using `raise from None`. logger.info(f"[WAIT] {url}: HTTP {resp.status_code}")
except RequestException:
all_ready = False
logger.info(f"[WAIT] {url}: connection failed")
# underlying process died?
result = self._poll() result = self._poll()
if result is not None and result != 0: if result is not None and result != 0:
raise RuntimeError("Server exited unexpectedly.") from None raise RuntimeError(
f"Server at {node_ip} exited unexpectedly."
) from None
# if all nodes ready, exit
if all_ready:
break
# check timeout
if time.time() - start > timeout:
not_ready_nodes = [n for n, ok in ready.items() if not ok]
self._terminate_server()
raise RuntimeError(
f"Timeout: these nodes did not become ready: {not_ready_nodes}"
) from None
time.sleep(5) time.sleep(5)
if time.time() - start > timeout:
raise RuntimeError(
"Server failed to start in time.") from None
@property @property
def url_root(self) -> str: def url_root(self) -> str:
return f"http://{self.host}:{self.port}" return f"http://{self.host}:{self.port}"
def _terminate_server(self) -> None:
"""Subclasses override this method to customize server process termination"""
self.proc.terminate()
try:
self.proc.wait(8)
except subprocess.TimeoutExpired:
# force kill if needed
self.proc.kill()
def url_for(self, *parts: str) -> str: def url_for(self, *parts: str) -> str:
return self.url_root + "/" + "/".join(parts) return self.url_root + "/" + "/".join(parts)

View File

@@ -1,50 +0,0 @@
test_name: "test GLM-4.5 multi-dp"
model: "ZhipuAI/GLM-4.5"
num_nodes: 2
npu_per_node: 16
env_common:
VLLM_USE_MODELSCOPE: true
OMP_PROC_BIND: false
OMP_NUM_THREADS: 100
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
NUMEXPR_MAX_THREADS: 128
deployment:
-
server_cmd: >
vllm serve "ZhipuAI/GLM-4.5"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 4
--data-parallel-size-local 2
--data-parallel-address $LOCAL_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 8
--seed 1024
--enable-expert-parallel
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 8192
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
-
server_cmd: >
vllm serve "ZhipuAI/GLM-4.5"
--headless
--data-parallel-size 4
--data-parallel-size-local 2
--data-parallel-start-rank 2
--data-parallel-address $MASTER_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 8
--seed 1024
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 8192
--enable-expert-parallel
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
benchmarks:

View File

@@ -31,8 +31,8 @@ class NodeInfo:
return (f"NodeInfo:\n" return (f"NodeInfo:\n"
f" index={self.index}\n" f" index={self.index}\n"
f" ip={self.ip}\n" f" ip={self.ip}\n"
f" server_port={self.server_port}\n" f" headless={self.headless}\n"
f" headless={self.headless}") f" server_port={self.server_port}")
class MultiNodeConfig: class MultiNodeConfig:

View File

@@ -87,7 +87,7 @@ spec:
- name: VLLM_ASCEND_REMOTE_URL - name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH - name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }} value: {{ result_file_path | default("/root/.cache/tests/ret") }}
- name: FAIL_TAG - name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }} value: {{ fail_tag | default("FAIL_TAG") }}
command: command:

View File

@@ -127,19 +127,14 @@ kill_npu_processes() {
run_tests_with_log() { run_tests_with_log() {
set +e set +e
kill_npu_processes kill_npu_processes
BASENAME=$(basename "$CONFIG_YAML_PATH" .yaml) pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
# each worker should have log file ret=$?
LOG_FILE="${RESULT_FILE_PATH}/${BASENAME}_worker_${LWS_WORKER_INDEX}.log"
mkdir -p ${RESULT_FILE_PATH}
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py 2>&1 | tee $LOG_FILE
ret=${PIPESTATUS[0]}
set -e set -e
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
if [ $ret -eq 0 ]; then if [ $ret -eq 0 ]; then
print_success "All tests passed!" print_success "All tests passed!"
else else
print_failure "Some tests failed!" print_failure "Some tests failed!"
mv LOG_FILE error_${LOG_FILE}
fi fi
fi fi
} }