[Bugfix] Quick hot fix for nightly CI (#4727)

Quick fix for multi-node tests

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-12-04 23:51:16 +08:00
committed by GitHub
parent ad0607f900
commit cd8e5be7c7
2 changed files with 30 additions and 29 deletions

View File

@@ -42,35 +42,9 @@ concurrency:
cancel-in-progress: true
jobs:
# Multi-node nightly job: calls the reusable multi-node workflow once per matrix entry.
multi-node-tests:
name: multi-node
# Restricts the job to scheduled or manually dispatched runs.
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
# A failing matrix entry does not cancel the remaining entries.
fail-fast: false
# Matrix entries run one at a time rather than in parallel.
max-parallel: 1
matrix:
# Each entry supplies the config file and size forwarded to the called workflow below.
test_config:
- name: multi-node-deepseek-dp
config_file_path: DeepSeek-R1-W8A8-A2.yaml
size: 2
- name: multi-node-deepseek-dp-torchair
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
size: 2
# Reusable workflow that receives the inputs and secrets listed below.
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a2
runner: linux-aarch64-a2-0
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
# Passes the KUBECONFIG_A2_B64 secret to the called workflow under the name KUBECONFIG_B64.
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
single-node-tests:
name: single-node
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
needs: multi-node-tests
strategy:
fail-fast: false
matrix:
@@ -139,3 +113,30 @@ jobs:
model_list: ${{ toJson(matrix.test_config.model_list) }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
upload: false
# Multi-node nightly job: calls the reusable multi-node workflow once per matrix entry.
multi-node-tests:
name: multi-node
# Restricts the job to scheduled or manually dispatched runs; always() keeps it from
# being skipped automatically when the single-node-tests dependency does not succeed.
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
# Ordered after the single-node job.
needs: single-node-tests
strategy:
# A failing matrix entry does not cancel the remaining entries.
fail-fast: false
# Matrix entries run one at a time rather than in parallel.
max-parallel: 1
matrix:
# Each entry supplies the config file and size forwarded to the called workflow below.
test_config:
- name: multi-node-deepseek-dp
config_file_path: DeepSeek-R1-W8A8-A2.yaml
size: 2
- name: multi-node-deepseek-dp-torchair
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
size: 2
# Reusable workflow that receives the inputs and secrets listed below.
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a2
runner: linux-aarch64-a2-0
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
# Passes the KUBECONFIG_A2_B64 secret to the called workflow under the name KUBECONFIG_B64.
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}

View File

@@ -100,7 +100,7 @@ async def test_multi_node() -> None:
disaggregated_prefill = config.disaggregated_prefill
server_port = config.server_port
proxy_port = config.proxy_port
server_host = config.node_info.ip
server_host = config.master_ip
proxy_script = config.envs.get("DISAGGREGATED_PREFILL_PROXY_SCRIPT", \
'examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py')
with config.launch_server_proxy(proxy_script):
@@ -123,8 +123,8 @@ async def test_multi_node() -> None:
run_aisbench_cases(local_model_path,
port,
aisbench_cases,
host_ip=config.cluster_ips[0])
host_ip=config.master_ip)
else:
# Nodes other than the master wait here until the task completes
master_url = f"http://{config.cluster_ips[0]}:{server_port}/health"
master_url = f"http://{config.master_ip}:{server_port}/health"
remote_server.hang_until_terminated(master_url)