[CI] Optimize nightly CI (#3898)
### What this PR does / why we need it?
This patch mainly fixes the problem of not being able to determine the
exit status of the pod's entrypoint script, and includes some other small
optimizations:
1. Shorten the wait-for-server timeout.
2. Fix a typo.
3. Fix the issue of ais_bench failing to correctly access the proxy URL
in a PD separation scenario.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main:
83f478bb19
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
24
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
24
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
@@ -60,13 +60,13 @@ defaults:
|
||||
# only cancel in-progress runs of the same workflow
|
||||
# and ignore the lint / 8 cards test type
|
||||
concurrency:
|
||||
group: ascend-nightly-${{ github.ref }}-${{ inputs.config_file_path }}
|
||||
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
e2e:
|
||||
name: ${{ inputs.config_file_path }}
|
||||
# This is a runner with no NPU for k8s controller
|
||||
# This is the runner with no NPU for k8s controller
|
||||
runs-on: ${{ inputs.runner }}
|
||||
container:
|
||||
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
||||
@@ -75,7 +75,7 @@ jobs:
|
||||
KUBECTL: /root/.cache/.kube/kubectl
|
||||
NAMESPACE: vllm-project
|
||||
LEADER_POD: vllm-0
|
||||
RESULT_FILE: /root/.cache/tests/ret/test_result.txt
|
||||
RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
|
||||
steps:
|
||||
- name: Install system denpendencies
|
||||
run: |
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
pip install jinja2-cli
|
||||
|
||||
apt-get update -y && apt-get install -y git curl
|
||||
#apt-get update -y && apt-get install -y git curl
|
||||
|
||||
- name: Install kubectl
|
||||
run: |
|
||||
@@ -117,8 +117,8 @@ jobs:
|
||||
run: |
|
||||
# pre clear the crd resources created by lws
|
||||
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
|
||||
|
||||
- name: Launch cluster
|
||||
id: launcher
|
||||
run: |
|
||||
set -e
|
||||
|
||||
@@ -130,6 +130,8 @@ jobs:
|
||||
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
|
||||
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
|
||||
result_file_path="$RESULT_FILE"
|
||||
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
|
||||
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
|
||||
|
||||
required_params=("size" "replicas" "image" "config_file_path")
|
||||
for param in "${required_params[@]}"; do
|
||||
@@ -155,8 +157,7 @@ jobs:
|
||||
-D vllm_ascend_ref="$vllm_ascend_ref" \
|
||||
-D result_file_path="$result_file_path" \
|
||||
-D npu_per_node="$npu_per_node" \
|
||||
-D controller_name="$HOSTNAME" \
|
||||
-D kb_secret=${{ secrets.KUBECONFIG_B64 }} \
|
||||
-D fail_tag="$fail_tag" \
|
||||
--outfile lws.yaml
|
||||
|
||||
kubectl apply -f ./lws.yaml
|
||||
@@ -180,7 +181,14 @@ jobs:
|
||||
|
||||
- name: Stream logs
|
||||
run: |
|
||||
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
|
||||
set -euo pipefail
|
||||
echo "Looking for logs containing: $FAIL_TAG"
|
||||
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do
|
||||
echo "$line"
|
||||
if echo "$line" | grep -q "$FAIL_TAG"; then
|
||||
exit 1 # workflow step failed
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Post process
|
||||
if: always()
|
||||
|
||||
@@ -44,7 +44,7 @@ defaults:
|
||||
# only cancel in-progress runs of the same workflow
|
||||
# and ignore the lint / 1 card / 4 cards test type
|
||||
concurrency:
|
||||
group: ascend-nightly-${{ github.ref }}-${{ inputs.tests }}
|
||||
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.tests }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
|
||||
@@ -42,6 +42,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
single-node-tests:
|
||||
name: single-node
|
||||
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -63,6 +64,7 @@ jobs:
|
||||
tests: ${{ matrix.test_config.tests }}
|
||||
|
||||
multi-node-tests:
|
||||
name: multi-node
|
||||
needs: single-node-tests
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
strategy:
|
||||
@@ -71,10 +73,10 @@ jobs:
|
||||
matrix:
|
||||
test_config:
|
||||
- name: multi-node-deepseek-dp
|
||||
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
|
||||
config_file_path: DeepSeek-R1-W8A8-A2.yaml
|
||||
size: 2
|
||||
- name: multi-node-deepseek-dp-torchair
|
||||
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml
|
||||
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
|
||||
size: 2
|
||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||
with:
|
||||
|
||||
@@ -42,6 +42,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
single-node-tests:
|
||||
name: single-node
|
||||
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -85,6 +86,7 @@ jobs:
|
||||
tests: ${{ matrix.test_config.tests }}
|
||||
|
||||
multi-node-tests:
|
||||
name: multi-node
|
||||
needs: single-node-tests
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
strategy:
|
||||
@@ -93,19 +95,19 @@ jobs:
|
||||
matrix:
|
||||
test_config:
|
||||
- name: multi-node-deepseek-pd
|
||||
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml
|
||||
config_file_path: DeepSeek-V3.yaml
|
||||
size: 2
|
||||
- name: multi-node-qwen3-dp
|
||||
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml
|
||||
config_file_path: Qwen3-235B-A3B.yaml
|
||||
size: 2
|
||||
- name: multi-node-dpsk-4node-pd
|
||||
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
|
||||
config_file_path: DeepSeek-R1-W8A8.yaml
|
||||
size: 4
|
||||
- name: multi-node-qwenw8a8-2node
|
||||
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml
|
||||
config_file_path: Qwen3-235B-W8A8.yaml
|
||||
size: 2
|
||||
- name: multi-node-glm-2node
|
||||
config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml
|
||||
config_file_path: GLM-4_5.yaml
|
||||
size: 2
|
||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||
with:
|
||||
@@ -117,12 +119,3 @@ jobs:
|
||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||
secrets:
|
||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
||||
|
||||
clear_resources:
|
||||
needs: multi-node-tests
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
uses: ./.github/workflows/_kill_lws_resources.yaml
|
||||
with:
|
||||
runner: linux-aarch64-a3-0
|
||||
secrets:
|
||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
||||
|
||||
Reference in New Issue
Block a user