[CI] Optimize nightly CI (#3898)

### What this PR does / why we need it?
This patch mainly fixes the problem of not being able to determine the
exit status of the pod's entrypoint script (see the sketch below), plus
some other small optimizations:
1. Shorten the wait-for-server timeout
2. Fix a typo
3. Fix ais_bench failing to correctly access the proxy URL in a
PD (prefill/decode) separation scenario
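
The exit-status fix comes down to a shell pipeline subtlety: once the entrypoint pipes pytest through `tee` to capture a log, `$?` reports the exit status of `tee` (the last pipeline stage) rather than of pytest, so the pod always looked successful. A minimal sketch of the failure mode and of the `PIPESTATUS` idiom the updated `run_tests_with_log` uses:

```bash
#!/usr/bin/env bash
# A plain pipeline reports the status of its LAST stage, so a failing
# test command still looks successful once piped through tee:
false | tee /tmp/run.log
echo "via \$?: $?"                          # prints 0 (tee succeeded)

# bash keeps the status of every stage in PIPESTATUS; index 0 is the
# test command itself, which is what the pod should exit with:
false | tee /tmp/run.log
echo "via PIPESTATUS: ${PIPESTATUS[0]}"     # prints 1 (tests failed)
```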
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0
- vLLM main:
83f478bb19

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Author: Li Wang
Date: 2025-10-30 23:42:20 +08:00
Committed by: GitHub
Parent: 2c291bc63f
Commit: eb0a2ee2d0
14 changed files with 94 additions and 66 deletions

View File

@@ -163,10 +163,11 @@ class RemoteOpenAIServer:
self.proxy_port = proxy_port
self._start_server(model, vllm_serve_args, env_dict)
- max_wait_seconds = max_wait_seconds or 7200
+ max_wait_seconds = max_wait_seconds or 1800
if self.disaggregated_prefill:
assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
- self._wait_for_server_pd(proxy_port=proxy_port)
+ self._wait_for_server_pd(proxy_port=proxy_port,
+                          timeout=max_wait_seconds)
else:
self._wait_for_server(url=self.url_for("health"),
timeout=max_wait_seconds)
@@ -186,7 +187,7 @@ class RemoteOpenAIServer:
"""Subclasses override this method to customize process polling"""
return self.proc.poll()
- def hang_until_terminated(self) -> None:
+ def hang_until_terminated(self, url) -> None:
"""
Wait until the server process terminates.
This is for headless mode, where the api server
@@ -196,7 +197,7 @@ class RemoteOpenAIServer:
try:
while True:
try:
- resp = client.get(self.url_for("health"), timeout=5)
+ resp = client.get(url, timeout=5)
if resp.status_code != 200:
break
time.sleep(5)
@@ -206,7 +207,7 @@ class RemoteOpenAIServer:
if isinstance(client, httpx.Client):
client.close()
- def _wait_for_server_pd(self, proxy_port: int):
+ def _wait_for_server_pd(self, proxy_port: int, timeout: float):
# Wait for all api_server nodes ready
assert self.nodes_info is not None, "cluster info must be provided"
for node_info in self.nodes_info:
@@ -214,12 +215,12 @@ class RemoteOpenAIServer:
continue
url_health = f"http://{node_info.ip}:{node_info.server_port}/health"
- self._wait_for_server(url=url_health, timeout=7200)
+ self._wait_for_server(url=url_health, timeout=timeout)
# Wait for proxy ready
master_node = self.nodes_info[0]
url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"
- self._wait_for_server(url=url_proxy, timeout=7200)
+ self._wait_for_server(url=url_proxy, timeout=timeout)
def _wait_for_server(self, *, url: str, timeout: float):
# run health check
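
For context, `_wait_for_server` is a deadline loop against a health URL, and with this change the PD path shares the same (now 1800 s) budget instead of a hard-coded 7200 s. A rough shell equivalent of that loop, with the URL and port as placeholder assumptions:

```bash
#!/usr/bin/env bash
# Rough stand-in for _wait_for_server: poll a health URL until it
# answers HTTP 200 or the deadline (default 1800 s) expires.
wait_for_server() {
    local url="$1" timeout="${2:-1800}"
    local deadline=$(( $(date +%s) + timeout ))
    while [ "$(date +%s)" -lt "$deadline" ]; do
        # -f makes curl fail on non-2xx, -s keeps it quiet
        if curl -sf -o /dev/null --max-time 5 "$url"; then
            return 0
        fi
        sleep 5
    done
    echo "server at $url not ready after ${timeout}s" >&2
    return 1
}

wait_for_server "http://localhost:8000/health"
```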

View File

@@ -97,3 +97,12 @@ deployment:
}
}'
benchmarks:
+ acc:
+ case_type: accuracy
+ dataset_path: vllm-ascend/gsm8k-lite
+ request_conf: vllm_api_general_chat
+ dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
+ max_out_len: 4096
+ batch_size: 512
+ baseline: 95
+ threshold: 5

View File

@@ -47,3 +47,4 @@ deployment:
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
benchmarks:

View File

@@ -17,6 +17,7 @@ setup_logger()
logger = logging.getLogger(__name__)
DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py"
DISAGGEGATED_PREFILL_PORT = 5333
+ CONFIG_BASE_PATH = "tests/e2e/nightly/multi_node/config/models/"
@dataclass
@@ -187,9 +188,8 @@ class MultiNodeConfig:
@classmethod
def from_yaml(cls, yaml_path: Optional[str] = None):
if not yaml_path:
- yaml_path = os.getenv(
-     "CONFIG_YAML_PATH",
-     "tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml")
+ yaml_path = os.getenv("CONFIG_YAML_PATH", "DeepSeek-V3.yaml")
+ yaml_path = os.path.join(CONFIG_BASE_PATH, yaml_path)
with open(yaml_path, 'r') as file:
config_data = yaml.safe_load(file)
test_name = config_data.get("test_name", "default_test")
@@ -255,6 +255,7 @@ class MultiNodeConfig:
ranktable_path = self.disaggregated_prefill.get("ranktable_path")
assert ranktable_gen_path is not None and ranktable_path is not None
if os.path.exists(str(ranktable_path)):
logger.info("ranktable has already generated")
return
local_host = self.cur_ip
@@ -286,6 +287,8 @@ class MultiNodeConfig:
assert self.nic_name is not None
env["GLOO_SOCKET_IFNAME"] = self.nic_name
+ logger.info(
+     f"Generating ranktable from command: {' '.join(map(str, cmd))}")
subprocess.run(cmd, env=env, check=True)
assert os.path.exists(
str(ranktable_path)), "failed generate ranktable.json"

View File

@@ -18,7 +18,7 @@ spec:
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
env:
- name: CONFIG_YAML_PATH
- value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }}
+ value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/root/workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
@@ -29,11 +29,9 @@ spec:
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
- value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
- - name: CONTROLLER_NAME
-   value: {{ controller_name | default("placeholder") }}
- - name: SECRET
-   value: {{ kb_secret | default("placeholder") }}
+ value: {{ result_file_path | default("/root/.cache/tests/ret") }}
+ - name: FAIL_TAG
+   value: {{ fail_tag | default("FAIL_TAG") }}
command:
- sh
- -c
@@ -80,7 +78,7 @@ spec:
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
env:
- name: CONFIG_YAML_PATH
- value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }}
+ value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/root/workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
@@ -92,6 +90,8 @@ spec:
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
+ - name: FAIL_TAG
+   value: {{ fail_tag | default("FAIL_TAG") }}
command:
- sh
- -c
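
Dropping `CONTROLLER_NAME` and `SECRET` removes the need for in-pod `kubectl`: instead of deleting its controller pod on failure, the script now stamps error output with `FAIL_TAG` so that whatever watches the pod can detect failures from the logs alone. A hypothetical external check (pod name and invocation are assumptions, not part of this PR):

```bash
# Scan a worker pod's log for the failure marker instead of relying on
# the pod to tear down its controller itself.
if kubectl logs "$POD_NAME" -n vllm-project | grep -q "FAIL_TAG"; then
    echo "nightly run failed on $POD_NAME" >&2
    exit 1
fi
```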

View File

@@ -20,6 +20,11 @@ print_section() {
echo -e "\n${BLUE}=== $1 ===${NC}"
}
+ print_failure() {
+ echo -e "${RED}${FAIL_TAG} ✗ ERROR: $1${NC}"
+ exit 1
+ }
# Function to print success messages
print_success() {
echo -e "${GREEN}$1${NC}"
@@ -161,32 +166,24 @@ kill_npu_processes() {
sleep 4
}
- run_tests() {
+ run_tests_with_log() {
set +e
kill_npu_processes
- pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
- ret=$?
+ BASENAME=$(basename "$CONFIG_YAML_PATH" .yaml)
+ # each worker should have log file
+ LOG_FILE="${RESULT_FILE_PATH}/${BASENAME}_worker_${LWS_WORKER_INDEX}.log"
+ mkdir -p ${RESULT_FILE_PATH}
+ pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py 2>&1 | tee $LOG_FILE
+ ret=${PIPESTATUS[0]}
+ set -e
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
if [ $ret -eq 0 ]; then
print_success "All tests passed!"
else
- print_error "Some tests failed!"
- kubectl delete pod $CONTROLLER_NAME -n vllm-project
+ print_failure "Some tests failed!"
+ mv LOG_FILE error_${LOG_FILE}
fi
fi
- set -e
}
- install_kubectl() {
- arch=$(uname -m)
- KUBECTL=/root/.cache/.kube/kubectl
- if echo "$arch" | grep -qiE "arm|aarch64"; then
- echo "Detected ARM architecture: $arch"
- KUBECTL="$KUBECTL"_arm
- fi
- install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
- echo "$SECRET" | base64 -d > /tmp/kubeconfig
- export KUBECONFIG=/tmp/kubeconfig
- }
main() {
@@ -194,7 +191,6 @@ main() {
check_and_config
checkout_src
install_sys_dependencies
- install_kubectl
install_vllm
install_ais_bench
# to speed up mooncake build process, install Go here
@@ -203,7 +199,7 @@ main() {
. $SRC_DIR/vllm-ascend/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh \
pooling_async_memecpy_v1 9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3
cd "$WORKSPACE/source_code/vllm-ascend"
- run_tests
+ run_tests_with_log
}
main "$@"

View File

@@ -118,6 +118,11 @@ async def test_multi_node() -> None:
port = proxy_port if disaggregated_prefill else server_port
# aisbench test
aisbench_cases = [acc_cmd, perf_cmd]
- run_aisbench_cases(local_model_path, port, aisbench_cases)
+ run_aisbench_cases(local_model_path,
+                    port,
+                    aisbench_cases,
+                    host_ip=config.cluster_ips[0])
else:
- remote_server.hang_until_terminated()
+ # for the nodes except master, should hang until the task complete
+ master_url = f"http://{config.cluster_ips[0]}:{server_port}/health"
+ remote_server.hang_until_terminated(master_url)
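
`hang_until_terminated` now takes the URL to watch, so non-master nodes block on the master api server's `/health` endpoint (the PD proxy, by contrast, serves `/healthcheck`). The worker-side behavior is roughly this loop, with the address variables as assumptions:

```bash
# Keep polling the master's health endpoint; once it stops answering
# 200, the task is considered complete and the worker can exit.
master_url="http://${MASTER_IP}:${SERVER_PORT}/health"
while curl -sf -o /dev/null --max-time 5 "$master_url"; do
    sleep 5
done
echo "master terminated; worker exiting"
```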