[Nightly] Optimize nightly CI (#4509)
### What this PR does / why we need it?
1. Optimize the multi-node waiting logic.
2. Remove the `tee` pipeline for logs, which can cause the job to hang.

### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.12.0

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -84,8 +84,6 @@ jobs:
|
|||||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||||
pip install jinja2-cli
|
pip install jinja2-cli
|
||||||
|
|
||||||
#apt-get update -y && apt-get install -y git curl
|
|
||||||
|
|
||||||
- name: Install kubectl
|
- name: Install kubectl
|
||||||
run: |
|
run: |
|
||||||
# Install kubectl
|
# Install kubectl
|
||||||
@@ -112,6 +110,8 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
# prepare for lws entrypoint scripts
|
# prepare for lws entrypoint scripts
|
||||||
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
|
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
|
||||||
|
# clear log directory
|
||||||
|
rm -fr $RESULT_FILE
|
||||||
|
|
||||||
- name: Clear resources
|
- name: Clear resources
|
||||||
run: |
|
run: |
|
||||||
@@ -263,5 +263,5 @@ jobs:
|
|||||||
- name: Post process
|
- name: Post process
|
||||||
if: always()
|
if: always()
|
||||||
run: |
|
run: |
|
||||||
kubectl get pods -n $NAMESPACE
|
kubectl get pods -n $NAMESPACE --ignore-not-found=true
|
||||||
kubectl delete -f ./lws.yaml
|
kubectl delete -f ./lws.yaml --ignore-not-found=true || true
|
||||||
|
|||||||
@@ -61,9 +61,6 @@ jobs:
|
|||||||
- name: multi-node-qwenw8a8-2node
|
- name: multi-node-qwenw8a8-2node
|
||||||
config_file_path: Qwen3-235B-W8A8.yaml
|
config_file_path: Qwen3-235B-W8A8.yaml
|
||||||
size: 2
|
size: 2
|
||||||
- name: multi-node-glm-2node
|
|
||||||
config_file_path: GLM-4_5.yaml
|
|
||||||
size: 2
|
|
||||||
- name: multi-node-dpsk3.2-exp-2node
|
- name: multi-node-dpsk3.2-exp-2node
|
||||||
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
|
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
|
||||||
size: 2
|
size: 2
|
||||||
@@ -134,9 +131,6 @@ jobs:
|
|||||||
- name: deepseek3_2-exp-w8a8
|
- name: deepseek3_2-exp-w8a8
|
||||||
os: linux-aarch64-a3-16
|
os: linux-aarch64-a3-16
|
||||||
tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
|
tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
|
||||||
- name: glm-4-5
|
|
||||||
os: linux-aarch64-a3-16
|
|
||||||
tests: tests/e2e/nightly/models/test_glm4_5.py
|
|
||||||
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
||||||
with:
|
with:
|
||||||
vllm: v0.12.0
|
vllm: v0.12.0
|
||||||
|
|||||||
@@ -78,6 +78,7 @@ if [ -n "$LOCAL_DEVICE_IDS" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
|
if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
|
||||||
|
timeout 180s \
|
||||||
GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
|
GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
|
||||||
--nproc_per_node 1 \
|
--nproc_per_node 1 \
|
||||||
--nnodes ${NNODES} \
|
--nnodes ${NNODES} \
|
||||||
|
|||||||
@@ -20,6 +20,7 @@
|
|||||||
import contextlib
|
import contextlib
|
||||||
import gc
|
import gc
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import shlex
|
import shlex
|
||||||
import subprocess
|
import subprocess
|
||||||
@@ -35,6 +36,7 @@ import requests
|
|||||||
import torch
|
import torch
|
||||||
from modelscope import snapshot_download # type: ignore[import-untyped]
|
from modelscope import snapshot_download # type: ignore[import-untyped]
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
from requests.exceptions import RequestException
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
|
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
|
||||||
BatchEncoding, BatchFeature)
|
BatchEncoding, BatchFeature)
|
||||||
@@ -70,6 +72,7 @@ _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
|
|||||||
PromptImageInput = _PromptMultiModalInput[Image.Image]
|
PromptImageInput = _PromptMultiModalInput[Image.Image]
|
||||||
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
|
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
|
||||||
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
|
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_TEST_DIR = os.path.dirname(__file__)
|
_TEST_DIR = os.path.dirname(__file__)
|
||||||
|
|
||||||
@@ -161,22 +164,17 @@ class RemoteOpenAIServer:
|
|||||||
max_wait_seconds = max_wait_seconds or 1800
|
max_wait_seconds = max_wait_seconds or 1800
|
||||||
if self.disaggregated_prefill:
|
if self.disaggregated_prefill:
|
||||||
assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
|
assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
|
||||||
self._wait_for_server_pd(proxy_port=proxy_port,
|
self._wait_for_server_pd(timeout=max_wait_seconds)
|
||||||
timeout=max_wait_seconds)
|
|
||||||
else:
|
else:
|
||||||
self._wait_for_server(url=self.url_for("health"),
|
self._wait_for_multiple_servers(
|
||||||
|
[(self.host, self.url_for("health"))],
|
||||||
timeout=max_wait_seconds)
|
timeout=max_wait_seconds)
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_value, traceback):
|
def __exit__(self, exc_type, exc_value, traceback):
|
||||||
self.proc.terminate()
|
self._terminate_server()
|
||||||
try:
|
|
||||||
self.proc.wait(8)
|
|
||||||
except subprocess.TimeoutExpired:
|
|
||||||
# force kill if needed
|
|
||||||
self.proc.kill()
|
|
||||||
|
|
||||||
def _poll(self) -> Optional[int]:
|
def _poll(self) -> Optional[int]:
|
||||||
"""Subclasses override this method to customize process polling"""
|
"""Subclasses override this method to customize process polling"""
|
||||||
@@ -201,48 +199,95 @@ class RemoteOpenAIServer:
|
|||||||
finally:
|
finally:
|
||||||
if isinstance(client, httpx.Client):
|
if isinstance(client, httpx.Client):
|
||||||
client.close()
|
client.close()
|
||||||
|
self._terminate_server()
|
||||||
|
|
||||||
def _wait_for_server_pd(self, proxy_port: int, timeout: float):
|
def _wait_for_server_pd(self, timeout: float):
|
||||||
# Wait for all api_server nodes ready
|
# Wait for all api_server nodes ready
|
||||||
assert self.nodes_info is not None, "cluster info must be provided"
|
assert self.nodes_info is not None, "cluster info must be provided"
|
||||||
for node_info in self.nodes_info:
|
proxy_port = self.proxy_port
|
||||||
if node_info.headless:
|
|
||||||
continue
|
|
||||||
|
|
||||||
url_health = f"http://{node_info.ip}:{node_info.server_port}/health"
|
def url_health(ip: str, port: int) -> str:
|
||||||
self._wait_for_server(url=url_health, timeout=timeout)
|
return f"http://{ip}:{port}/health"
|
||||||
|
|
||||||
|
targets = [(node_info.ip,
|
||||||
|
url_health(node_info.ip, node_info.server_port))
|
||||||
|
for node_info in self.nodes_info if not node_info.headless]
|
||||||
|
|
||||||
# Wait for proxy ready
|
# Wait for proxy ready
|
||||||
master_node = self.nodes_info[0]
|
master_node = self.nodes_info[0]
|
||||||
url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"
|
url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"
|
||||||
self._wait_for_server(url=url_proxy, timeout=timeout)
|
|
||||||
|
|
||||||
def _wait_for_server(self, *, url: str, timeout: float):
|
# Wait for master node proxy first
|
||||||
# run health check
|
self._wait_for_multiple_servers([(master_node.ip, url_proxy)],
|
||||||
|
timeout=timeout)
|
||||||
|
|
||||||
|
# Then wait for all api_server nodes
|
||||||
|
self._wait_for_multiple_servers(targets=targets, timeout=timeout)
|
||||||
|
|
||||||
|
def _wait_for_multiple_servers(self, targets, timeout: float):
|
||||||
|
"""
|
||||||
|
targets: List[(node_ip, url)]
|
||||||
|
"""
|
||||||
start = time.time()
|
start = time.time()
|
||||||
client = requests
|
client = requests
|
||||||
|
|
||||||
|
# track readiness
|
||||||
|
ready = {node_ip: False for node_ip, _ in targets}
|
||||||
|
|
||||||
|
# polling loop
|
||||||
while True:
|
while True:
|
||||||
|
all_ready = True
|
||||||
|
|
||||||
|
for node_ip, url in targets:
|
||||||
|
if ready[node_ip]:
|
||||||
|
continue # already ready
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if client.get(url).status_code == 200:
|
resp = client.get(url)
|
||||||
break
|
if resp.status_code == 200:
|
||||||
except Exception:
|
ready[node_ip] = True
|
||||||
# this exception can only be raised by requests.get,
|
logger.info(f"[READY] Node {node_ip} is ready.")
|
||||||
# which means the server is not ready yet.
|
else:
|
||||||
# the stack trace is not useful, so we suppress it
|
all_ready = False
|
||||||
# by using `raise from None`.
|
logger.info(f"[WAIT] {url}: HTTP {resp.status_code}")
|
||||||
|
except RequestException:
|
||||||
|
all_ready = False
|
||||||
|
logger.info(f"[WAIT] {url}: connection failed")
|
||||||
|
|
||||||
|
# underlying process died?
|
||||||
result = self._poll()
|
result = self._poll()
|
||||||
if result is not None and result != 0:
|
if result is not None and result != 0:
|
||||||
raise RuntimeError("Server exited unexpectedly.") from None
|
raise RuntimeError(
|
||||||
|
f"Server at {node_ip} exited unexpectedly."
|
||||||
|
) from None
|
||||||
|
|
||||||
|
# if all nodes ready, exit
|
||||||
|
if all_ready:
|
||||||
|
break
|
||||||
|
|
||||||
|
# check timeout
|
||||||
|
if time.time() - start > timeout:
|
||||||
|
not_ready_nodes = [n for n, ok in ready.items() if not ok]
|
||||||
|
self._terminate_server()
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Timeout: these nodes did not become ready: {not_ready_nodes}"
|
||||||
|
) from None
|
||||||
|
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
if time.time() - start > timeout:
|
|
||||||
raise RuntimeError(
|
|
||||||
"Server failed to start in time.") from None
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def url_root(self) -> str:
|
def url_root(self) -> str:
|
||||||
return f"http://{self.host}:{self.port}"
|
return f"http://{self.host}:{self.port}"
|
||||||
|
|
||||||
|
def _terminate_server(self) -> None:
|
||||||
|
"""Subclasses override this method to customize server process termination"""
|
||||||
|
self.proc.terminate()
|
||||||
|
try:
|
||||||
|
self.proc.wait(8)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
# force kill if needed
|
||||||
|
self.proc.kill()
|
||||||
|
|
||||||
def url_for(self, *parts: str) -> str:
|
def url_for(self, *parts: str) -> str:
|
||||||
return self.url_root + "/" + "/".join(parts)
|
return self.url_root + "/" + "/".join(parts)
|
||||||
|
|
||||||
|
|||||||
@@ -1,50 +0,0 @@
|
|||||||
test_name: "test GLM-4.5 multi-dp"
|
|
||||||
model: "ZhipuAI/GLM-4.5"
|
|
||||||
num_nodes: 2
|
|
||||||
npu_per_node: 16
|
|
||||||
env_common:
|
|
||||||
VLLM_USE_MODELSCOPE: true
|
|
||||||
OMP_PROC_BIND: false
|
|
||||||
OMP_NUM_THREADS: 100
|
|
||||||
HCCL_BUFFSIZE: 1024
|
|
||||||
SERVER_PORT: 8080
|
|
||||||
NUMEXPR_MAX_THREADS: 128
|
|
||||||
|
|
||||||
deployment:
|
|
||||||
-
|
|
||||||
server_cmd: >
|
|
||||||
vllm serve "ZhipuAI/GLM-4.5"
|
|
||||||
--host 0.0.0.0
|
|
||||||
--port $SERVER_PORT
|
|
||||||
--data-parallel-size 4
|
|
||||||
--data-parallel-size-local 2
|
|
||||||
--data-parallel-address $LOCAL_IP
|
|
||||||
--data-parallel-rpc-port 13389
|
|
||||||
--tensor-parallel-size 8
|
|
||||||
--seed 1024
|
|
||||||
--enable-expert-parallel
|
|
||||||
--max-num-seqs 16
|
|
||||||
--max-model-len 8192
|
|
||||||
--max-num-batched-tokens 8192
|
|
||||||
--trust-remote-code
|
|
||||||
--no-enable-prefix-caching
|
|
||||||
--gpu-memory-utilization 0.9
|
|
||||||
-
|
|
||||||
server_cmd: >
|
|
||||||
vllm serve "ZhipuAI/GLM-4.5"
|
|
||||||
--headless
|
|
||||||
--data-parallel-size 4
|
|
||||||
--data-parallel-size-local 2
|
|
||||||
--data-parallel-start-rank 2
|
|
||||||
--data-parallel-address $MASTER_IP
|
|
||||||
--data-parallel-rpc-port 13389
|
|
||||||
--tensor-parallel-size 8
|
|
||||||
--seed 1024
|
|
||||||
--max-num-seqs 16
|
|
||||||
--max-model-len 8192
|
|
||||||
--max-num-batched-tokens 8192
|
|
||||||
--enable-expert-parallel
|
|
||||||
--trust-remote-code
|
|
||||||
--no-enable-prefix-caching
|
|
||||||
--gpu-memory-utilization 0.9
|
|
||||||
benchmarks:
|
|
||||||
@@ -31,8 +31,8 @@ class NodeInfo:
|
|||||||
return (f"NodeInfo:\n"
|
return (f"NodeInfo:\n"
|
||||||
f" index={self.index}\n"
|
f" index={self.index}\n"
|
||||||
f" ip={self.ip}\n"
|
f" ip={self.ip}\n"
|
||||||
f" server_port={self.server_port}\n"
|
f" headless={self.headless}\n"
|
||||||
f" headless={self.headless}")
|
f" server_port={self.server_port}")
|
||||||
|
|
||||||
|
|
||||||
class MultiNodeConfig:
|
class MultiNodeConfig:
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ spec:
|
|||||||
- name: VLLM_ASCEND_REMOTE_URL
|
- name: VLLM_ASCEND_REMOTE_URL
|
||||||
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
|
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
|
||||||
- name: RESULT_FILE_PATH
|
- name: RESULT_FILE_PATH
|
||||||
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
|
value: {{ result_file_path | default("/root/.cache/tests/ret") }}
|
||||||
- name: FAIL_TAG
|
- name: FAIL_TAG
|
||||||
value: {{ fail_tag | default("FAIL_TAG") }}
|
value: {{ fail_tag | default("FAIL_TAG") }}
|
||||||
command:
|
command:
|
||||||
|
|||||||
@@ -127,19 +127,14 @@ kill_npu_processes() {
|
|||||||
run_tests_with_log() {
|
run_tests_with_log() {
|
||||||
set +e
|
set +e
|
||||||
kill_npu_processes
|
kill_npu_processes
|
||||||
BASENAME=$(basename "$CONFIG_YAML_PATH" .yaml)
|
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
|
||||||
# each worker should have log file
|
ret=$?
|
||||||
LOG_FILE="${RESULT_FILE_PATH}/${BASENAME}_worker_${LWS_WORKER_INDEX}.log"
|
|
||||||
mkdir -p ${RESULT_FILE_PATH}
|
|
||||||
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py 2>&1 | tee $LOG_FILE
|
|
||||||
ret=${PIPESTATUS[0]}
|
|
||||||
set -e
|
set -e
|
||||||
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
|
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
|
||||||
if [ $ret -eq 0 ]; then
|
if [ $ret -eq 0 ]; then
|
||||||
print_success "All tests passed!"
|
print_success "All tests passed!"
|
||||||
else
|
else
|
||||||
print_failure "Some tests failed!"
|
print_failure "Some tests failed!"
|
||||||
mv LOG_FILE error_${LOG_FILE}
|
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user