[CI] Optimize nightly CI (#3898)

### What this PR does / why we need it? This patch mainly fixes the problem of not being able to determine the exit status of the pod's entrypoint script, along with some other small optimizations: 1. Shorten the wait-for-server timeout. 2. Fix typos. 3. Fix the issue of ais_bench failing to correctly access the proxy URL in a PD separation scenario. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0 - vLLM main: 83f478bb19 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-30 23:42:20 +08:00
parent 2c291bc63f
commit eb0a2ee2d0
14 changed files with 94 additions and 66 deletions
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -60,13 +60,13 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 8 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.ref }}-${{ inputs.config_file_path }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path }}
  cancel-in-progress: true
 jobs:
  e2e:
    name: ${{ inputs.config_file_path }}
-    # This is a runner with no NPU for k8s controller
+    # This is the runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
@@ -75,7 +75,7 @@ jobs:
        KUBECTL: /root/.cache/.kube/kubectl
        NAMESPACE: vllm-project
        LEADER_POD: vllm-0
-        RESULT_FILE: /root/.cache/tests/ret/test_result.txt
+        RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
    steps:
        - name: Install system denpendencies
          run: |
@@ -84,7 +84,7 @@ jobs:
           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
           pip install jinja2-cli
-           apt-get update -y && apt-get install -y git curl
+           #apt-get update -y && apt-get install -y git curl
        - name: Install kubectl
          run: |
@@ -117,8 +117,8 @@ jobs:
          run: |
            # pre clear the crd resources created by lws
            kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
        - name: Launch cluster
          id: launcher
          run: |
            set -e
@@ -130,6 +130,8 @@ jobs:
            vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
            vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
            result_file_path="$RESULT_FILE"
            fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
            echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
            required_params=("size" "replicas" "image" "config_file_path")
            for param in "${required_params[@]}"; do
@@ -155,8 +157,7 @@ jobs:
              -D vllm_ascend_ref="$vllm_ascend_ref" \
              -D result_file_path="$result_file_path" \
              -D npu_per_node="$npu_per_node" \
-              -D controller_name="$HOSTNAME" \
+              -D fail_tag="$fail_tag" \
              -D kb_secret=${{ secrets.KUBECONFIG_B64 }} \
              --outfile lws.yaml
            kubectl apply -f ./lws.yaml
@@ -180,7 +181,14 @@ jobs:
        - name: Stream logs
          run: |
-            kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
+            set -euo pipefail
            echo "Looking for logs containing: $FAIL_TAG"
            kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do
              echo "$line"
              if echo "$line" | grep -q "$FAIL_TAG"; then
                exit 1   # workflow step failed
              fi
            done
        - name: Post process
          if: always()
--- a/.github/workflows/_e2e_nightly_single_node.yaml
+++ b/.github/workflows/_e2e_nightly_single_node.yaml
@@ -44,7 +44,7 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 1 card / 4 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.ref }}-${{ inputs.tests }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.tests }}
  cancel-in-progress: true
 jobs:
--- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml
@@ -42,6 +42,7 @@ concurrency:
 jobs:
  single-node-tests:
    name: single-node
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    strategy:
      fail-fast: false
@@ -63,6 +64,7 @@ jobs:
      tests: ${{ matrix.test_config.tests }}
  multi-node-tests:
    name: multi-node
    needs: single-node-tests
    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
    strategy:
@@ -71,10 +73,10 @@ jobs:
      matrix:
        test_config:
          - name: multi-node-deepseek-dp
-            config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
+            config_file_path: DeepSeek-R1-W8A8-A2.yaml
            size: 2
          - name: multi-node-deepseek-dp-torchair
-            config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
+            config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
            size: 2
    uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
    with:
--- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml
@@ -42,6 +42,7 @@ concurrency:
 jobs:
  single-node-tests:
    name: single-node
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    strategy:
      fail-fast: false
@@ -85,6 +86,7 @@ jobs:
      tests: ${{ matrix.test_config.tests }}
  multi-node-tests:
    name: multi-node
    needs: single-node-tests
    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
    strategy:
@@ -93,19 +95,19 @@ jobs:
      matrix:
        test_config:
          - name: multi-node-deepseek-pd
-            config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml
+            config_file_path: DeepSeek-V3.yaml
            size: 2
          - name: multi-node-qwen3-dp
-            config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml
+            config_file_path: Qwen3-235B-A3B.yaml
            size: 2
          - name: multi-node-dpsk-4node-pd
-            config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
+            config_file_path: DeepSeek-R1-W8A8.yaml
            size: 4
          - name: multi-node-qwenw8a8-2node
-            config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml
+            config_file_path: Qwen3-235B-W8A8.yaml
            size: 2
          - name: multi-node-glm-2node
-            config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml
+            config_file_path: GLM-4_5.yaml
            size: 2
    uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
    with:
@@ -117,12 +119,3 @@ jobs:
      config_file_path: ${{ matrix.test_config.config_file_path }}
    secrets:
      KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
  clear_resources:
    needs: multi-node-tests
    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
    uses: ./.github/workflows/_kill_lws_resources.yaml
    with:
      runner: linux-aarch64-a3-0
    secrets:
      KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -163,10 +163,11 @@ class RemoteOpenAIServer:
        self.proxy_port = proxy_port
        self._start_server(model, vllm_serve_args, env_dict)
-        max_wait_seconds = max_wait_seconds or 7200
+        max_wait_seconds = max_wait_seconds or 1800
        if self.disaggregated_prefill:
            assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
-            self._wait_for_server_pd(proxy_port=proxy_port)
+            self._wait_for_server_pd(proxy_port=proxy_port,
                                     timeout=max_wait_seconds)
        else:
            self._wait_for_server(url=self.url_for("health"),
                                  timeout=max_wait_seconds)
@@ -186,7 +187,7 @@ class RemoteOpenAIServer:
        """Subclasses override this method to customize process polling"""
        return self.proc.poll()
-    def hang_until_terminated(self) -> None:
+    def hang_until_terminated(self, url) -> None:
        """
        Wait until the server process terminates.
        This is for headless mode, where the api server
@@ -196,7 +197,7 @@ class RemoteOpenAIServer:
        try:
            while True:
                try:
-                    resp = client.get(self.url_for("health"), timeout=5)
+                    resp = client.get(url, timeout=5)
                    if resp.status_code != 200:
                        break
                    time.sleep(5)
@@ -206,7 +207,7 @@ class RemoteOpenAIServer:
            if isinstance(client, httpx.Client):
                client.close()
-    def _wait_for_server_pd(self, proxy_port: int):
+    def _wait_for_server_pd(self, proxy_port: int, timeout: float):
        # Wait for all api_server nodes ready
        assert self.nodes_info is not None, "cluster info must be provided"
        for node_info in self.nodes_info:
@@ -214,12 +215,12 @@ class RemoteOpenAIServer:
                continue
            url_health = f"http://{node_info.ip}:{node_info.server_port}/health"
-            self._wait_for_server(url=url_health, timeout=7200)
+            self._wait_for_server(url=url_health, timeout=timeout)
        # Wait for proxy ready
        master_node = self.nodes_info[0]
        url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"
-        self._wait_for_server(url=url_proxy, timeout=7200)
+        self._wait_for_server(url=url_proxy, timeout=timeout)
    def _wait_for_server(self, *, url: str, timeout: float):
        # run health check
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml
@@ -97,3 +97,12 @@ deployment:
            }
        }'
 benchmarks:
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 4096
    batch_size: 512
    baseline: 95
    threshold: 5
--- a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml
@@ -47,3 +47,4 @@ deployment:
        --no-enable-prefix-caching
        --gpu-memory-utilization 0.9
 benchmarks:
--- a/tests/e2e/nightly/multi_node/config/multi_node_config.py
+++ b/tests/e2e/nightly/multi_node/config/multi_node_config.py
@@ -17,6 +17,7 @@ setup_logger()
 logger = logging.getLogger(__name__)
 DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py"
 DISAGGEGATED_PREFILL_PORT = 5333
 CONFIG_BASE_PATH = "tests/e2e/nightly/multi_node/config/models/"
@dataclass
@@ -187,9 +188,8 @@ class MultiNodeConfig:
    @classmethod
    def from_yaml(cls, yaml_path: Optional[str] = None):
        if not yaml_path:
-            yaml_path = os.getenv(
+            yaml_path = os.getenv("CONFIG_YAML_PATH", "DeepSeek-V3.yaml")
-                "CONFIG_YAML_PATH",
+        yaml_path = os.path.join(CONFIG_BASE_PATH, yaml_path)
                "tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml")
        with open(yaml_path, 'r') as file:
            config_data = yaml.safe_load(file)
        test_name = config_data.get("test_name", "default_test")
@@ -255,6 +255,7 @@ class MultiNodeConfig:
        ranktable_path = self.disaggregated_prefill.get("ranktable_path")
        assert ranktable_gen_path is not None and ranktable_path is not None
        if os.path.exists(str(ranktable_path)):
            logger.info("ranktable has already generated")
            return
        local_host = self.cur_ip
@@ -286,6 +287,8 @@ class MultiNodeConfig:
        assert self.nic_name is not None
        env["GLOO_SOCKET_IFNAME"] = self.nic_name
        logger.info(
            f"Generating ranktable from command: {' '.join(map(str, cmd))}")
        subprocess.run(cmd, env=env, check=True)
        assert os.path.exists(
            str(ranktable_path)), "failed generate ranktable.json"
--- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
+++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
@@ -18,7 +18,7 @@ spec:
            image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
            env:
              - name: CONFIG_YAML_PATH
-                value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }}
+                value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
              - name: WORKSPACE
                value: "/root/workspace"
              # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
@@ -29,11 +29,9 @@ spec:
              - name: VLLM_ASCEND_REMOTE_URL
                value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
              - name: RESULT_FILE_PATH
-                value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
+                value: {{ result_file_path | default("/root/.cache/tests/ret") }}
-              - name: CONTROLLER_NAME
+              - name: FAIL_TAG
-                value: {{ controller_name | default("placeholder") }}
+                value: {{ fail_tag | default("FAIL_TAG") }}
              - name: SECRET
                value: {{ kb_secret | default("placeholder") }}
            command:
              - sh
              - -c
@@ -80,7 +78,7 @@ spec:
            image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
            env:
              - name: CONFIG_YAML_PATH
-                value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }}
+                value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
              - name: WORKSPACE
                value: "/root/workspace"
              # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
@@ -92,6 +90,8 @@ spec:
                value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
              - name: RESULT_FILE_PATH
                value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
              - name: FAIL_TAG
                value: {{ fail_tag | default("FAIL_TAG") }}
            command:
              - sh
              - -c
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -20,6 +20,11 @@ print_section() {
    echo -e "\n${BLUE}=== $1 ===${NC}"
 }
 print_failure() {
    echo -e "${RED}${FAIL_TAG} ✗ ERROR: $1${NC}"
    exit 1
 }
 # Function to print success messages
 print_success() {
    echo -e "${GREEN}✓ $1${NC}"
@@ -161,32 +166,24 @@ kill_npu_processes() {
  sleep 4
 }
-run_tests() {
+run_tests_with_log() {
    set +e
    kill_npu_processes
-    pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
+    BASENAME=$(basename "$CONFIG_YAML_PATH" .yaml)
-    ret=$?
+    # each worker should have log file
    LOG_FILE="${RESULT_FILE_PATH}/${BASENAME}_worker_${LWS_WORKER_INDEX}.log"
    mkdir -p ${RESULT_FILE_PATH}
    pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py 2>&1 | tee $LOG_FILE
    ret=${PIPESTATUS[0]}
    set -e
    if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
        if [ $ret -eq 0 ]; then
            print_success "All tests passed!"
        else
-            print_error "Some tests failed!"
+            print_failure "Some tests failed!"
-            kubectl delete pod $CONTROLLER_NAME -n vllm-project
+            mv LOG_FILE error_${LOG_FILE}
        fi
    fi
    set -e
 }
 install_kubectl() {
    arch=$(uname -m)
    KUBECTL=/root/.cache/.kube/kubectl
    if echo "$arch" | grep -qiE "arm|aarch64"; then
        echo "Detected ARM architecture: $arch"
        KUBECTL="$KUBECTL"_arm
    fi
    install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
    echo "$SECRET" | base64 -d > /tmp/kubeconfig
    export KUBECONFIG=/tmp/kubeconfig
 }
 main() {
@@ -194,7 +191,6 @@ main() {
    check_and_config
    checkout_src
    install_sys_dependencies
    install_kubectl
    install_vllm
    install_ais_bench
    # to speed up mooncake build process, install Go here
@@ -203,7 +199,7 @@ main() {
    . $SRC_DIR/vllm-ascend/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh \
    pooling_async_memecpy_v1 9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3
    cd "$WORKSPACE/source_code/vllm-ascend"
-    run_tests
+    run_tests_with_log
 }
 main "$@"
--- a/tests/e2e/nightly/multi_node/test_multi_node.py
+++ b/tests/e2e/nightly/multi_node/test_multi_node.py
@@ -118,6 +118,11 @@ async def test_multi_node() -> None:
                port = proxy_port if disaggregated_prefill else server_port
                # aisbench test
                aisbench_cases = [acc_cmd, perf_cmd]
-                run_aisbench_cases(local_model_path, port, aisbench_cases)
+                run_aisbench_cases(local_model_path,
                                   port,
                                   aisbench_cases,
                                   host_ip=config.cluster_ips[0])
            else:
-                remote_server.hang_until_terminated()
+                # for the nodes except master, should hang until the task complete
                master_url = f"http://{config.cluster_ips[0]}:{server_port}/health"
                remote_server.hang_until_terminated(master_url)
--- a/tools/aisbench.py
+++ b/tools/aisbench.py
@@ -68,6 +68,7 @@ class AisbenchRunner:
                 model: str,
                 port: int,
                 aisbench_config: dict,
                 host_ip: str = "localhost",
                 verify=True):
        self.model = model
        self.dataset_path = maybe_download_from_modelscope(
@@ -76,6 +77,7 @@ class AisbenchRunner:
        assert self.dataset_path is not None and self.model_path is not None, \
            f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}"
        self.port = port
        self.host_ip = host_ip
        self.task_type = aisbench_config["case_type"]
        self.request_conf = aisbench_config["request_conf"]
        self.dataset_conf = aisbench_config.get("dataset_conf")
@@ -131,6 +133,7 @@ class AisbenchRunner:
            content = f.read()
        content = re.sub(r'model=.*', f'model="{self.model}",', content)
        content = re.sub(r'host_port.*', f'host_port = {self.port},', content)
        content = re.sub(r'host_ip.*', f'host_ip = "{self.host_ip}",', content)
        content = re.sub(r'max_out_len.*',
                         f'max_out_len = {self.max_out_len},', content)
        content = re.sub(r'batch_size.*', f'batch_size = {self.batch_size},',
@@ -238,14 +241,21 @@ class AisbenchRunner:
        assert self.baseline - self.threshold <= acc_value <= self.baseline + self.threshold, f"Accuracy verification failed. The accuracy of {self.dataset_path} is {acc_value}, which is not within {self.threshold} relative to baseline {self.baseline}."
-def run_aisbench_cases(model, port, aisbench_cases, server_args=""):
+def run_aisbench_cases(model,
                       port,
                       aisbench_cases,
                       server_args="",
                       host_ip="localhost"):
    aisbench_results = []
    aisbench_errors = []
    for aisbench_case in aisbench_cases:
        if not aisbench_case:
            continue
        try:
-            with AisbenchRunner(model, port, aisbench_case) as aisbench:
+            with AisbenchRunner(model=model,
                                port=port,
                                host_ip=host_ip,
                                aisbench_config=aisbench_case) as aisbench:
                aisbench_results.append(aisbench.result)
        except Exception as e:
            aisbench_results.append("")