From 90ae114569a8c2b4b529eaafafe63cd9dc34436c Mon Sep 17 00:00:00 2001 From: Li Wang Date: Tue, 28 Oct 2025 20:40:03 +0800 Subject: [PATCH] [CI] Fix nightly CI (#3821) ### What this PR does / why we need it? This patch fixes the nightly CI runs [failure](https://github.com/vllm-project/vllm-ascend/actions/runs/18848144365) ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.1 --------- Signed-off-by: wangli --- .../workflows/_e2e_nightly_multi_node.yaml | 11 +++- tests/e2e/conftest.py | 2 +- .../multi_node/config/multi_node_config.py | 12 ++-- tests/e2e/nightly/multi_node/scripts/run.sh | 6 +- .../e2e/nightly/multi_node/test_multi_node.py | 10 ++- tools/aisbench.py | 62 ++++++++++++++++++- 6 files changed, 79 insertions(+), 24 deletions(-) diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index ca2854da..d9bbcf63 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -88,12 +88,17 @@ jobs: - name: Install kubectl run: | # Install kubectl + arch=$(uname -m) + + if echo "$arch" | grep -qiE "arm|aarch64"; then + echo "Detected ARM architecture: $arch" + KUBECTL="$KUBECTL"_arm + fi install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl - + # Verify kubectl installation kubectl version --client=true - # TODO: Add A2 tests - name: Decode kubeconfig from secrets run: | # Decode and save kubeconfig @@ -175,7 +180,7 @@ jobs: - name: Determine is success run: | - TIMEOUT=600 + TIMEOUT=300 ELAPSED=0 while [ ! 
-f "$RESULT_FILE" ]; do sleep 5 diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 6940f4bf..64ac88cc 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -116,7 +116,7 @@ class RemoteOpenAIServer: model: str, vllm_serve_args: Union[list[str], str], *, - server_host: str = "0.0.0.0", + server_host: str = '0.0.0.0', server_port: int = 8080, env_dict: Optional[dict[str, str]] = None, seed: Optional[int] = None, diff --git a/tests/e2e/nightly/multi_node/config/multi_node_config.py b/tests/e2e/nightly/multi_node/config/multi_node_config.py index 4ca8d773..620ba39c 100644 --- a/tests/e2e/nightly/multi_node/config/multi_node_config.py +++ b/tests/e2e/nightly/multi_node/config/multi_node_config.py @@ -84,16 +84,17 @@ class MultiNodeConfig: self.envs["LOCAL_IP"] = self.cur_ip self.envs["NIC_NAME"] = self.nic_name + master_ip = self.cluster_ips[0] if self.disaggregated_prefill: self.envs[ "DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get( "ranktable_path") if self.cur_index < self.decode_start_index: - self.envs["MASTER_IP"] = self.cluster_ips[0] + master_ip = self.cluster_ips[0] else: - self.envs["MASTER_IP"] = self.cluster_ips[ - self.decode_start_index] + master_ip = self.cluster_ips[self.decode_start_index] + self.envs["MASTER_IP"] = master_ip ascend_path = "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages" self.envs[ "LD_LIBRARY_PATH"] = f"{ascend_path}:{self.envs.get('LD_LIBRARY_PATH', os.environ.get('LD_LIBRARY_PATH', ''))}" @@ -288,8 +289,3 @@ class MultiNodeConfig: subprocess.run(cmd, env=env, check=True) assert os.path.exists( str(ranktable_path)), "failed generate ranktable.json" - - -if __name__ == '__main__': - config = MultiNodeConfig.from_yaml() - print(config.perf_cmd) diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh index f596dfe6..38cbe4ef 100644 --- a/tests/e2e/nightly/multi_node/scripts/run.sh +++ 
b/tests/e2e/nightly/multi_node/scripts/run.sh @@ -121,7 +121,7 @@ download_go() { } install_ais_bench() { - local AIS_BENCH="$SRC_DIR/benchmark" + local AIS_BENCH="$SRC_DIR/vllm-ascend/benchmark" git clone https://gitee.com/aisbench/benchmark.git $AIS_BENCH cd $AIS_BENCH git checkout v3.0-20250930-master @@ -166,8 +166,8 @@ run_tests() { kill_npu_processes ret=$? if [ "$LWS_WORKER_INDEX" -eq 0 ]; then - mkdir -p "$(dirname "$RESULT_PATH")" - echo $ret > "$RESULT_PATH" + mkdir -p "$(dirname "$RESULT_FILE_PATH")" + echo $ret > "$RESULT_FILE_PATH" fi return $ret } diff --git a/tests/e2e/nightly/multi_node/test_multi_node.py b/tests/e2e/nightly/multi_node/test_multi_node.py index 9b3ac26c..1f6af1da 100644 --- a/tests/e2e/nightly/multi_node/test_multi_node.py +++ b/tests/e2e/nightly/multi_node/test_multi_node.py @@ -48,7 +48,7 @@ def get_local_model_path_with_retry( async def get_completions(url: str, model: str, prompts: Union[str, List[str]], **api_kwargs: Any) -> List[str]: """ - Asynchronously send HTTP requests to a /v1/completions endpoint. + Asynchronously send HTTP requests to endpoint. Args: url: Full endpoint URL, e.g. 
"http://localhost:1025/v1/completions" @@ -88,7 +88,10 @@ async def get_completions(url: str, model: str, prompts: Union[str, List[str]], @pytest.mark.asyncio async def test_multi_node() -> None: config = MultiNodeConfig.from_yaml() + # To avoid modelscope 400 HttpError, we should download the model with retry local_model_path = get_local_model_path_with_retry(config.model) + config.server_cmd = config.server_cmd.replace(config.model, + local_model_path) assert local_model_path is not None, "can not find any local weight for test" env_dict = config.envs perf_cmd = config.perf_cmd @@ -113,11 +116,6 @@ async def test_multi_node() -> None: ) as remote_server: if config.is_master: port = proxy_port if disaggregated_prefill else server_port - base_url = f"http://localhost:{port}/v1/completions" - _ = await get_completions(url=base_url, - model=local_model_path, - prompts=prompts, - api_kwargs=api_keyword_args) # aisbench test if acc_cmd: run_aisbench_cases(local_model_path, port, acc_cmd) diff --git a/tools/aisbench.py b/tools/aisbench.py index f81e4069..75b8fd92 100644 --- a/tools/aisbench.py +++ b/tools/aisbench.py @@ -14,11 +14,16 @@ # limitations under the License. # This file is a part of the vllm-ascend project. 
# +import hashlib import json import os import re import subprocess +import tempfile +from pathlib import Path +import filelock +import huggingface_hub import pandas as pd from modelscope import snapshot_download # type: ignore @@ -63,10 +68,12 @@ class AisbenchRunner: port: int, aisbench_config: dict, verify=True): - self.dataset_path = snapshot_download(aisbench_config["dataset_path"], - repo_type='dataset') self.model = model - self.model_path = snapshot_download(model) + self.dataset_path = maybe_download_from_modelscope( + aisbench_config["dataset_path"], repo_type="dataset") + self.model_path = maybe_download_from_modelscope(model) + assert self.dataset_path is not None and self.model_path is not None, \ + f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}" self.port = port self.task_type = aisbench_config["case_type"] self.request_conf = aisbench_config["request_conf"] @@ -254,3 +261,52 @@ def run_aisbench_cases(model, port, aisbench_cases): def get_TTFT(result): TTFT = result[0][0].loc["TTFT", "Average"][:-3] return float(TTFT) + + +temp_dir = tempfile.gettempdir() + + +def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None): + lock_dir = cache_dir or temp_dir + model_name_or_path = str(model_name_or_path) + os.makedirs(os.path.dirname(lock_dir), exist_ok=True) + model_name = model_name_or_path.replace("/", "-") + hash_name = hashlib.sha256(model_name.encode()).hexdigest() + # add hash to avoid conflict with old users' lock files + lock_file_name = hash_name + model_name + ".lock" + # mode 0o666 is required for the filelock to be shared across users + lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name), + mode=0o666) + return lock + + +def maybe_download_from_modelscope( + model: str, + repo_type: str | None = None, + revision: str | None = None, + download_dir: str | None = None, + ignore_patterns: str | list[str] | None = None, + allow_patterns: list[str] | str | None = None, +) -> 
str | None: + """ + Download model/dataset from ModelScope hub. + Returns the path to the downloaded model, or None if the model is not + downloaded from ModelScope. + """ + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. + with get_lock(model, download_dir): + if not os.path.exists(model): + model_path = snapshot_download( + model_id=model, + repo_type=repo_type, + cache_dir=download_dir, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + revision=revision, + ignore_file_pattern=ignore_patterns, + allow_patterns=allow_patterns, + ) + else: + model_path = model + return model_path + return None