[Bugfix] Quick hot fix for nightly CI (#4727)
Quick fix for multi-node tests --------- Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -42,35 +42,9 @@ concurrency:
|
|||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
multi-node-tests:
|
|
||||||
name: multi-node
|
|
||||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
max-parallel: 1
|
|
||||||
matrix:
|
|
||||||
test_config:
|
|
||||||
- name: multi-node-deepseek-dp
|
|
||||||
config_file_path: DeepSeek-R1-W8A8-A2.yaml
|
|
||||||
size: 2
|
|
||||||
- name: multi-node-deepseek-dp-torchair
|
|
||||||
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
|
|
||||||
size: 2
|
|
||||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
|
||||||
with:
|
|
||||||
soc_version: a2
|
|
||||||
runner: linux-aarch64-a2-0
|
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
|
||||||
replicas: 1
|
|
||||||
size: ${{ matrix.test_config.size }}
|
|
||||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
|
||||||
secrets:
|
|
||||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
|
|
||||||
|
|
||||||
single-node-tests:
|
single-node-tests:
|
||||||
name: single-node
|
name: single-node
|
||||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||||
needs: multi-node-tests
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -139,3 +113,30 @@ jobs:
|
|||||||
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
|
||||||
upload: false
|
upload: false
|
||||||
|
|
||||||
|
|
||||||
|
multi-node-tests:
|
||||||
|
name: multi-node
|
||||||
|
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||||
|
needs: single-node-tests
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
max-parallel: 1
|
||||||
|
matrix:
|
||||||
|
test_config:
|
||||||
|
- name: multi-node-deepseek-dp
|
||||||
|
config_file_path: DeepSeek-R1-W8A8-A2.yaml
|
||||||
|
size: 2
|
||||||
|
- name: multi-node-deepseek-dp-torchair
|
||||||
|
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
|
||||||
|
size: 2
|
||||||
|
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||||
|
with:
|
||||||
|
soc_version: a2
|
||||||
|
runner: linux-aarch64-a2-0
|
||||||
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
||||||
|
replicas: 1
|
||||||
|
size: ${{ matrix.test_config.size }}
|
||||||
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
|
secrets:
|
||||||
|
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
|
||||||
|
|||||||
@@ -100,7 +100,7 @@ async def test_multi_node() -> None:
|
|||||||
disaggregated_prefill = config.disaggregated_prefill
|
disaggregated_prefill = config.disaggregated_prefill
|
||||||
server_port = config.server_port
|
server_port = config.server_port
|
||||||
proxy_port = config.proxy_port
|
proxy_port = config.proxy_port
|
||||||
server_host = config.node_info.ip
|
server_host = config.master_ip
|
||||||
proxy_script = config.envs.get("DISAGGREGATED_PREFILL_PROXY_SCRIPT", \
|
proxy_script = config.envs.get("DISAGGREGATED_PREFILL_PROXY_SCRIPT", \
|
||||||
'examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py')
|
'examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py')
|
||||||
with config.launch_server_proxy(proxy_script):
|
with config.launch_server_proxy(proxy_script):
|
||||||
@@ -123,8 +123,8 @@ async def test_multi_node() -> None:
|
|||||||
run_aisbench_cases(local_model_path,
|
run_aisbench_cases(local_model_path,
|
||||||
port,
|
port,
|
||||||
aisbench_cases,
|
aisbench_cases,
|
||||||
host_ip=config.cluster_ips[0])
|
host_ip=config.master_ip)
|
||||||
else:
|
else:
|
||||||
# for the nodes except master, should hang until the task complete
|
# for the nodes except master, should hang until the task complete
|
||||||
master_url = f"http://{config.cluster_ips[0]}:{server_port}/health"
|
master_url = f"http://{config.master_ip}:{server_port}/health"
|
||||||
remote_server.hang_until_terminated(master_url)
|
remote_server.hang_until_terminated(master_url)
|
||||||
|
|||||||
Reference in New Issue
Block a user