[CI] Fix nightly CI (#3821)

### What this PR does / why we need it? This patch fixes the nightly CI run [failure](https://github.com/vllm-project/vllm-ascend/actions/runs/18848144365) ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.1 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-28 20:40:03 +08:00
parent a7450db1bd
commit 90ae114569
6 changed files with 79 additions and 24 deletions
--- a/tests/e2e/nightly/multi_node/config/multi_node_config.py
+++ b/tests/e2e/nightly/multi_node/config/multi_node_config.py
@@ -84,16 +84,17 @@ class MultiNodeConfig:
        self.envs["LOCAL_IP"] = self.cur_ip
        self.envs["NIC_NAME"] = self.nic_name

+        master_ip = self.cluster_ips[0]
        if self.disaggregated_prefill:
            self.envs[
                "DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get(
                    "ranktable_path")
            if self.cur_index < self.decode_start_index:
-                self.envs["MASTER_IP"] = self.cluster_ips[0]
+                master_ip = self.cluster_ips[0]
            else:
-                self.envs["MASTER_IP"] = self.cluster_ips[
-                    self.decode_start_index]
+                master_ip = self.cluster_ips[self.decode_start_index]

+        self.envs["MASTER_IP"] = master_ip
        ascend_path = "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages"
        self.envs[
            "LD_LIBRARY_PATH"] = f"{ascend_path}:{self.envs.get('LD_LIBRARY_PATH', os.environ.get('LD_LIBRARY_PATH', ''))}"
@@ -288,8 +289,3 @@ class MultiNodeConfig:
        subprocess.run(cmd, env=env, check=True)
        assert os.path.exists(
            str(ranktable_path)), "failed generate ranktable.json"
-
-
-if __name__ == '__main__':
-    config = MultiNodeConfig.from_yaml()
-    print(config.perf_cmd)
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -121,7 +121,7 @@ download_go() {
 }

 install_ais_bench() {
-    local AIS_BENCH="$SRC_DIR/benchmark"
+    local AIS_BENCH="$SRC_DIR/vllm-ascend/benchmark"
    git clone https://gitee.com/aisbench/benchmark.git $AIS_BENCH
    cd $AIS_BENCH
    git checkout v3.0-20250930-master
@@ -166,8 +166,8 @@ run_tests() {
    kill_npu_processes
    ret=$?
    if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
-        mkdir -p "$(dirname "$RESULT_PATH")"
-        echo $ret > "$RESULT_PATH"
+        mkdir -p "$(dirname "$RESULT_FILE_PATH")"
+        echo $ret > "$RESULT_FILE_PATH"
    fi
    return $ret
 }
--- a/tests/e2e/nightly/multi_node/test_multi_node.py
+++ b/tests/e2e/nightly/multi_node/test_multi_node.py
@@ -48,7 +48,7 @@ def get_local_model_path_with_retry(
 async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
                          **api_kwargs: Any) -> List[str]:
    """
-    Asynchronously send HTTP requests to a /v1/completions endpoint.
+    Asynchronously send HTTP requests to endpoint.

    Args:
        url: Full endpoint URL, e.g. "http://localhost:1025/v1/completions"
@@ -88,7 +88,10 @@ async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
@pytest.mark.asyncio
 async def test_multi_node() -> None:
    config = MultiNodeConfig.from_yaml()
+    # To avoid modelscope 400 HttpError, we should download the model with retry
    local_model_path = get_local_model_path_with_retry(config.model)
+    config.server_cmd = config.server_cmd.replace(config.model,
+                                                  local_model_path)
    assert local_model_path is not None, "can not find any local weight for test"
    env_dict = config.envs
    perf_cmd = config.perf_cmd
@@ -113,11 +116,6 @@ async def test_multi_node() -> None:
        ) as remote_server:
            if config.is_master:
                port = proxy_port if disaggregated_prefill else server_port
-                base_url = f"http://localhost:{port}/v1/completions"
-                _ = await get_completions(url=base_url,
-                                          model=local_model_path,
-                                          prompts=prompts,
-                                          api_kwargs=api_keyword_args)
                # aisbench test
                if acc_cmd:
                    run_aisbench_cases(local_model_path, port, acc_cmd)