From cd8e5be7c7bb1860e00e6a41cad8bb1db6ce133a Mon Sep 17 00:00:00 2001 From: Li Wang Date: Thu, 4 Dec 2025 23:51:16 +0800 Subject: [PATCH] [Bugfix] Quick hot fix for nightly CI (#4727) Quick fix for multi-node tests --------- Signed-off-by: wangli --- .../vllm_ascend_test_nightly_a2.yaml | 53 ++++++++++--------- .../e2e/nightly/multi_node/test_multi_node.py | 6 +-- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml index f098ccc0..01ee56ca 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml @@ -42,35 +42,9 @@ concurrency: cancel-in-progress: true jobs: - multi-node-tests: - name: multi-node - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - strategy: - fail-fast: false - max-parallel: 1 - matrix: - test_config: - - name: multi-node-deepseek-dp - config_file_path: DeepSeek-R1-W8A8-A2.yaml - size: 2 - - name: multi-node-deepseek-dp-torchair - config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml - size: 2 - uses: ./.github/workflows/_e2e_nightly_multi_node.yaml - with: - soc_version: a2 - runner: linux-aarch64-a2-0 - image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' - replicas: 1 - size: ${{ matrix.test_config.size }} - config_file_path: ${{ matrix.test_config.config_file_path }} - secrets: - KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }} - single-node-tests: name: single-node if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - needs: multi-node-tests strategy: fail-fast: false matrix: @@ -139,3 +113,30 @@ jobs: model_list: ${{ toJson(matrix.test_config.model_list) }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11' upload: false + + + multi-node-tests: + name: multi-node + if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + needs: single-node-tests + strategy: + fail-fast: false + max-parallel: 1 + matrix: + test_config: + - name: multi-node-deepseek-dp + config_file_path: DeepSeek-R1-W8A8-A2.yaml + size: 2 + - name: multi-node-deepseek-dp-torchair + config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml + size: 2 + uses: ./.github/workflows/_e2e_nightly_multi_node.yaml + with: + soc_version: a2 + runner: linux-aarch64-a2-0 + image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' + replicas: 1 + size: ${{ matrix.test_config.size }} + config_file_path: ${{ matrix.test_config.config_file_path }} + secrets: + KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }} diff --git a/tests/e2e/nightly/multi_node/test_multi_node.py b/tests/e2e/nightly/multi_node/test_multi_node.py index 212ad26d..66745a97 100644 --- a/tests/e2e/nightly/multi_node/test_multi_node.py +++ b/tests/e2e/nightly/multi_node/test_multi_node.py @@ -100,7 +100,7 @@ async def test_multi_node() -> None: disaggregated_prefill = config.disaggregated_prefill server_port = config.server_port proxy_port = config.proxy_port - server_host = config.node_info.ip + server_host = config.master_ip proxy_script = config.envs.get("DISAGGREGATED_PREFILL_PROXY_SCRIPT", \ 'examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py') with config.launch_server_proxy(proxy_script): @@ -123,8 +123,8 @@ async def test_multi_node() -> None: run_aisbench_cases(local_model_path, port, aisbench_cases, - host_ip=config.cluster_ips[0]) + host_ip=config.master_ip) else: # for the nodes except master, should hang until the task complete - master_url = f"http://{config.cluster_ips[0]}:{server_port}/health" + master_url = f"http://{config.master_ip}:{server_port}/health" remote_server.hang_until_terminated(master_url)