[CI] Fix nightly CI (#3821)

### What this PR does / why we need it? This patch fixes the nightly CI run [failure](https://github.com/vllm-project/vllm-ascend/actions/runs/18848144365) ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.1 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-28 20:40:03 +08:00
parent a7450db1bd
commit 90ae114569
6 changed files with 79 additions and 24 deletions
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -88,12 +88,17 @@ jobs:
        - name: Install kubectl
          run: |
            # Install kubectl
+            arch=$(uname -m)
+
+            if echo "$arch" | grep -qiE "arm|aarch64"; then
+              echo "Detected ARM architecture: $arch"
+              KUBECTL="$KUBECTL"_arm
+            fi
            install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
-            
+
            # Verify kubectl installation
            kubectl version --client=true

-        # TODO: Add A2 tests
        - name: Decode kubeconfig from secrets
          run: |
            # Decode and save kubeconfig
@@ -175,7 +180,7 @@ jobs:

        - name: Determine is success
          run: |
-            TIMEOUT=600
+            TIMEOUT=300
            ELAPSED=0
            while [ ! -f "$RESULT_FILE" ]; do
              sleep 5
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -116,7 +116,7 @@ class RemoteOpenAIServer:
                 model: str,
                 vllm_serve_args: Union[list[str], str],
                 *,
-                 server_host: str = "0.0.0.0",
+                 server_host: str = '0.0.0.0',
                 server_port: int = 8080,
                 env_dict: Optional[dict[str, str]] = None,
                 seed: Optional[int] = None,
--- a/tests/e2e/nightly/multi_node/config/multi_node_config.py
+++ b/tests/e2e/nightly/multi_node/config/multi_node_config.py
@@ -84,16 +84,17 @@ class MultiNodeConfig:
        self.envs["LOCAL_IP"] = self.cur_ip
        self.envs["NIC_NAME"] = self.nic_name

+        master_ip = self.cluster_ips[0]
        if self.disaggregated_prefill:
            self.envs[
                "DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get(
                    "ranktable_path")
            if self.cur_index < self.decode_start_index:
-                self.envs["MASTER_IP"] = self.cluster_ips[0]
+                master_ip = self.cluster_ips[0]
            else:
-                self.envs["MASTER_IP"] = self.cluster_ips[
-                    self.decode_start_index]
+                master_ip = self.cluster_ips[self.decode_start_index]

+        self.envs["MASTER_IP"] = master_ip
        ascend_path = "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages"
        self.envs[
            "LD_LIBRARY_PATH"] = f"{ascend_path}:{self.envs.get('LD_LIBRARY_PATH', os.environ.get('LD_LIBRARY_PATH', ''))}"
@@ -288,8 +289,3 @@ class MultiNodeConfig:
        subprocess.run(cmd, env=env, check=True)
        assert os.path.exists(
            str(ranktable_path)), "failed generate ranktable.json"
-
-
-if __name__ == '__main__':
-    config = MultiNodeConfig.from_yaml()
-    print(config.perf_cmd)
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -121,7 +121,7 @@ download_go() {
 }

 install_ais_bench() {
-    local AIS_BENCH="$SRC_DIR/benchmark"
+    local AIS_BENCH="$SRC_DIR/vllm-ascend/benchmark"
    git clone https://gitee.com/aisbench/benchmark.git $AIS_BENCH
    cd $AIS_BENCH
    git checkout v3.0-20250930-master
@@ -166,8 +166,8 @@ run_tests() {
    kill_npu_processes
    ret=$?
    if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
-        mkdir -p "$(dirname "$RESULT_PATH")"
-        echo $ret > "$RESULT_PATH"
+        mkdir -p "$(dirname "$RESULT_FILE_PATH")"
+        echo $ret > "$RESULT_FILE_PATH"
    fi
    return $ret
 }
--- a/tests/e2e/nightly/multi_node/test_multi_node.py
+++ b/tests/e2e/nightly/multi_node/test_multi_node.py
@@ -48,7 +48,7 @@ def get_local_model_path_with_retry(
 async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
                          **api_kwargs: Any) -> List[str]:
    """
-    Asynchronously send HTTP requests to a /v1/completions endpoint.
+    Asynchronously send HTTP requests to endpoint.

    Args:
        url: Full endpoint URL, e.g. "http://localhost:1025/v1/completions"
@@ -88,7 +88,10 @@ async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
@pytest.mark.asyncio
 async def test_multi_node() -> None:
    config = MultiNodeConfig.from_yaml()
+    # To avoid modelscope 400 HttpError, we should download the model with retry
    local_model_path = get_local_model_path_with_retry(config.model)
+    config.server_cmd = config.server_cmd.replace(config.model,
+                                                  local_model_path)
    assert local_model_path is not None, "can not find any local weight for test"
    env_dict = config.envs
    perf_cmd = config.perf_cmd
@@ -113,11 +116,6 @@ async def test_multi_node() -> None:
        ) as remote_server:
            if config.is_master:
                port = proxy_port if disaggregated_prefill else server_port
-                base_url = f"http://localhost:{port}/v1/completions"
-                _ = await get_completions(url=base_url,
-                                          model=local_model_path,
-                                          prompts=prompts,
-                                          api_kwargs=api_keyword_args)
                # aisbench test
                if acc_cmd:
                    run_aisbench_cases(local_model_path, port, acc_cmd)
--- a/tools/aisbench.py
+++ b/tools/aisbench.py
@@ -14,11 +14,16 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
+import hashlib
 import json
 import os
 import re
 import subprocess
+import tempfile
+from pathlib import Path

+import filelock
+import huggingface_hub
 import pandas as pd
 from modelscope import snapshot_download  # type: ignore

@@ -63,10 +68,12 @@ class AisbenchRunner:
                 port: int,
                 aisbench_config: dict,
                 verify=True):
-        self.dataset_path = snapshot_download(aisbench_config["dataset_path"],
-                                              repo_type='dataset')
        self.model = model
-        self.model_path = snapshot_download(model)
+        self.dataset_path = maybe_download_from_modelscope(
+            aisbench_config["dataset_path"], repo_type="dataset")
+        self.model_path = maybe_download_from_modelscope(model)
+        assert self.dataset_path is not None and self.model_path is not None, \
+            f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}"
        self.port = port
        self.task_type = aisbench_config["case_type"]
        self.request_conf = aisbench_config["request_conf"]
@@ -254,3 +261,52 @@ def run_aisbench_cases(model, port, aisbench_cases):
 def get_TTFT(result):
    TTFT = result[0][0].loc["TTFT", "Average"][:-3]
    return float(TTFT)
+
+
+temp_dir = tempfile.gettempdir()
+
+
+def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
+    """Return a FileLock for the model in ``cache_dir`` or the temp dir."""
+    lock_dir = cache_dir or temp_dir
+    # Create lock_dir itself (not its parent) so the lock file can be placed.
+    os.makedirs(lock_dir, exist_ok=True)
+    model_name = str(model_name_or_path).replace("/", "-")
+    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
+    # add hash to avoid conflict with old users' lock files
+    lock_file_name = hash_name + model_name + ".lock"
+    # mode 0o666 is required for the filelock to be shared across users
+    return filelock.FileLock(os.path.join(lock_dir, lock_file_name),
+                             mode=0o666)
+
+
+def maybe_download_from_modelscope(
+    model: str,
+    repo_type: str | None = None,
+    revision: str | None = None,
+    download_dir: str | None = None,
+    ignore_patterns: str | list[str] | None = None,
+    allow_patterns: list[str] | str | None = None,
+) -> str | None:
+    """
+    Resolve a model/dataset to a local path, downloading it from the
+    ModelScope hub when ``model`` is not an existing local path.
+
+    Returns ``model`` unchanged when it already exists on disk; otherwise
+    the path returned by ``snapshot_download``.
+    """
+    # Use file lock to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    with get_lock(model, download_dir):
+        # An existing local path is used as-is; anything else is fetched.
+        if os.path.exists(model):
+            return model
+        return snapshot_download(
+            model_id=model,
+            repo_type=repo_type,
+            cache_dir=download_dir,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            revision=revision,
+            ignore_file_pattern=ignore_patterns,
+            allow_patterns=allow_patterns,
+        )