[CI] Upgrade nightly multi-node-tests max-parallel to 2 (#7035)
### What this PR does / why we need it?
1. Increase nightly multi-node test max-parallel from 1 to 2, and fix
resource conflicts that arise when tests run concurrently.
2. Fix parse-trigger job: Add an if condition so it only runs on
schedule, workflow_dispatch, or PRs labeled nightly-test
3. Adjust nightly schedule: Shift trigger time from 24:00 to 23:45
(UTC+8)
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
62
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
62
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
@@ -66,7 +66,7 @@ defaults:
|
|||||||
# only cancel in-progress runs of the same workflow
|
# only cancel in-progress runs of the same workflow
|
||||||
# and ignore the lint / 8 cards test type
|
# and ignore the lint / 8 cards test type
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
|
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
@@ -80,7 +80,6 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
KUBECONFIG: /tmp/kubeconfig
|
KUBECONFIG: /tmp/kubeconfig
|
||||||
NAMESPACE: vllm-project
|
NAMESPACE: vllm-project
|
||||||
LEADER_POD: vllm-0
|
|
||||||
steps:
|
steps:
|
||||||
- name: Decode kubeconfig from secrets
|
- name: Decode kubeconfig from secrets
|
||||||
run: |
|
run: |
|
||||||
@@ -101,6 +100,17 @@ jobs:
|
|||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Set job variables
|
||||||
|
run: |
|
||||||
|
# Derive a unique, valid k8s resource name from config_file_path.
|
||||||
|
# Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars.
|
||||||
|
config_file="${{ inputs.config_file_path }}"
|
||||||
|
lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
|
||||||
|
LWS_NAME="vllm-${lws_suffix}"
|
||||||
|
echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV
|
||||||
|
echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV
|
||||||
|
echo "Computed LWS_NAME=${LWS_NAME}"
|
||||||
|
|
||||||
- name: Prepare scripts
|
- name: Prepare scripts
|
||||||
run: |
|
run: |
|
||||||
# prepare for lws entrypoint scripts
|
# prepare for lws entrypoint scripts
|
||||||
@@ -110,14 +120,14 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
CRD_NAME="${CRD_NAME:-vllm}"
|
|
||||||
TIMEOUT=${TIMEOUT:-120}
|
TIMEOUT=${TIMEOUT:-120}
|
||||||
SLEEP_INTERVAL=2
|
SLEEP_INTERVAL=2
|
||||||
|
|
||||||
echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
|
echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
|
||||||
kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
|
kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found
|
||||||
|
kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found
|
||||||
|
|
||||||
echo "Waiting for all pods starting with 'vllm' to be deleted..."
|
echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
|
||||||
START_TIME=$(date +%s)
|
START_TIME=$(date +%s)
|
||||||
|
|
||||||
while true; do
|
while true; do
|
||||||
@@ -126,14 +136,14 @@ jobs:
|
|||||||
|
|
||||||
if [[ $ELAPSED -ge $TIMEOUT ]]; then
|
if [[ $ELAPSED -ge $TIMEOUT ]]; then
|
||||||
echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
|
echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
|
||||||
kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
|
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
|
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
|
||||||
|
|
||||||
if [[ -z "$PODS_EXIST" ]]; then
|
if [[ -z "$PODS_EXIST" ]]; then
|
||||||
echo "All vllm pods deleted."
|
echo "All pods for [$LWS_NAME] deleted."
|
||||||
break
|
break
|
||||||
else
|
else
|
||||||
echo "Waiting for pods to be deleted: $PODS_EXIST"
|
echo "Waiting for pods to be deleted: $PODS_EXIST"
|
||||||
@@ -174,6 +184,7 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
jinja2 $TEMPLATE_FILE \
|
jinja2 $TEMPLATE_FILE \
|
||||||
|
-D lws_name="$LWS_NAME" \
|
||||||
-D size="$size" \
|
-D size="$size" \
|
||||||
-D replicas="$replicas" \
|
-D replicas="$replicas" \
|
||||||
-D image="$image" \
|
-D image="$image" \
|
||||||
@@ -190,7 +201,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Waiting for pod ready
|
- name: Waiting for pod ready
|
||||||
run: |
|
run: |
|
||||||
POD_PREFIX="${POD_PREFIX:-vllm-0}"
|
POD_PREFIX="${LWS_NAME}-0"
|
||||||
SIZE="${{ inputs.size }}"
|
SIZE="${{ inputs.size }}"
|
||||||
TIMEOUT=1200 # default timeout 20 minutes
|
TIMEOUT=1200 # default timeout 20 minutes
|
||||||
|
|
||||||
@@ -260,7 +271,7 @@ jobs:
|
|||||||
trap cleanup EXIT
|
trap cleanup EXIT
|
||||||
|
|
||||||
for i in $(seq 1 $((size - 1))); do
|
for i in $(seq 1 $((size - 1))); do
|
||||||
POD="vllm-0-${i}"
|
POD="${LWS_NAME}-0-${i}"
|
||||||
|
|
||||||
echo "==== Collecting logs from worker pod: $POD ===="
|
echo "==== Collecting logs from worker pod: $POD ===="
|
||||||
kubectl logs -f "$POD" -n "$NAMESPACE" \
|
kubectl logs -f "$POD" -n "$NAMESPACE" \
|
||||||
@@ -290,5 +301,34 @@ jobs:
|
|||||||
- name: Post process
|
- name: Post process
|
||||||
if: always()
|
if: always()
|
||||||
run: |
|
run: |
|
||||||
|
echo "Current pod status:"
|
||||||
kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
|
kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
|
||||||
|
|
||||||
|
echo "Deleting resources for [$LWS_NAME]..."
|
||||||
kubectl delete -f ./lws.yaml --ignore-not-found=true || true
|
kubectl delete -f ./lws.yaml --ignore-not-found=true || true
|
||||||
|
|
||||||
|
echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
|
||||||
|
TIMEOUT=300
|
||||||
|
SLEEP_INTERVAL=5
|
||||||
|
START_TIME=$(date +%s)
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
NOW=$(date +%s)
|
||||||
|
ELAPSED=$((NOW - START_TIME))
|
||||||
|
|
||||||
|
if [[ $ELAPSED -ge $TIMEOUT ]]; then
|
||||||
|
echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
|
||||||
|
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
|
||||||
|
|
||||||
|
if [[ -z "$PODS_EXIST" ]]; then
|
||||||
|
echo "All pods for [$LWS_NAME] have terminated."
|
||||||
|
break
|
||||||
|
else
|
||||||
|
echo "Waiting for pods to terminate: $PODS_EXIST"
|
||||||
|
sleep $SLEEP_INTERVAL
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|||||||
10
.github/workflows/schedule_nightly_test_a2.yaml
vendored
10
.github/workflows/schedule_nightly_test_a2.yaml
vendored
@@ -21,8 +21,8 @@ name: Nightly-A2
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
schedule:
|
schedule:
|
||||||
# Run test at 24:00 Beijing time (UTC+8)
|
# Run test at 23:45 Beijing time (UTC+8)
|
||||||
- cron: "0 16 * * *"
|
- cron: "45 15 * * *"
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
branches:
|
||||||
@@ -50,6 +50,10 @@ jobs:
|
|||||||
parse-trigger:
|
parse-trigger:
|
||||||
name: Parse trigger and determine test scope
|
name: Parse trigger and determine test scope
|
||||||
runs-on: linux-aarch64-a2b3-0
|
runs-on: linux-aarch64-a2b3-0
|
||||||
|
if: >-
|
||||||
|
github.event_name == 'schedule' ||
|
||||||
|
github.event_name == 'workflow_dispatch' ||
|
||||||
|
contains(github.event.pull_request.labels.*.name, 'nightly-test')
|
||||||
outputs:
|
outputs:
|
||||||
should_run: ${{ steps.parse.outputs.should_run }}
|
should_run: ${{ steps.parse.outputs.should_run }}
|
||||||
test_filter: ${{ steps.parse.outputs.test_filter }}
|
test_filter: ${{ steps.parse.outputs.test_filter }}
|
||||||
@@ -201,7 +205,7 @@ jobs:
|
|||||||
if: always() && needs.parse-trigger.outputs.should_run == 'true'
|
if: always() && needs.parse-trigger.outputs.should_run == 'true'
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
max-parallel: 1
|
max-parallel: 2
|
||||||
matrix:
|
matrix:
|
||||||
test_config:
|
test_config:
|
||||||
- name: multi-node-deepseek-dp
|
- name: multi-node-deepseek-dp
|
||||||
|
|||||||
10
.github/workflows/schedule_nightly_test_a3.yaml
vendored
10
.github/workflows/schedule_nightly_test_a3.yaml
vendored
@@ -22,8 +22,8 @@ name: Nightly-A3
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
schedule:
|
schedule:
|
||||||
# Run test at 24:00 Beijing time (UTC+8)
|
# Run test at 23:45 Beijing time (UTC+8)
|
||||||
- cron: "0 16 * * *"
|
- cron: "45 15 * * *"
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
branches:
|
||||||
@@ -50,6 +50,10 @@ jobs:
|
|||||||
parse-trigger:
|
parse-trigger:
|
||||||
name: Parse trigger and determine test scope
|
name: Parse trigger and determine test scope
|
||||||
runs-on: linux-aarch64-a2b3-0
|
runs-on: linux-aarch64-a2b3-0
|
||||||
|
if: >-
|
||||||
|
github.event_name == 'schedule' ||
|
||||||
|
github.event_name == 'workflow_dispatch' ||
|
||||||
|
contains(github.event.pull_request.labels.*.name, 'nightly-test')
|
||||||
outputs:
|
outputs:
|
||||||
should_run: ${{ steps.parse.outputs.should_run }}
|
should_run: ${{ steps.parse.outputs.should_run }}
|
||||||
test_filter: ${{ steps.parse.outputs.test_filter }}
|
test_filter: ${{ steps.parse.outputs.test_filter }}
|
||||||
@@ -127,7 +131,7 @@ jobs:
|
|||||||
if: always() && needs.parse-trigger.outputs.should_run == 'true'
|
if: always() && needs.parse-trigger.outputs.should_run == 'true'
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
max-parallel: 1
|
max-parallel: 2
|
||||||
matrix:
|
matrix:
|
||||||
test_config:
|
test_config:
|
||||||
- name: multi-node-deepseek-pd
|
- name: multi-node-deepseek-pd
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||||
kind: LeaderWorkerSet
|
kind: LeaderWorkerSet
|
||||||
metadata:
|
metadata:
|
||||||
name: vllm
|
name: {{ lws_name | default("vllm") }}
|
||||||
namespace: vllm-project
|
namespace: vllm-project
|
||||||
spec:
|
spec:
|
||||||
replicas: {{ replicas | default(1) }}
|
replicas: {{ replicas | default(1) }}
|
||||||
@@ -128,7 +128,7 @@ spec:
|
|||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: Service
|
||||||
metadata:
|
metadata:
|
||||||
name: vllm-leader
|
name: {{ lws_name | default("vllm") }}-leader
|
||||||
namespace: vllm-project
|
namespace: vllm-project
|
||||||
spec:
|
spec:
|
||||||
ports:
|
ports:
|
||||||
@@ -137,6 +137,6 @@ spec:
|
|||||||
protocol: TCP
|
protocol: TCP
|
||||||
targetPort: 8080
|
targetPort: 8080
|
||||||
selector:
|
selector:
|
||||||
leaderworkerset.sigs.k8s.io/name: vllm
|
leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
|
||||||
role: leader
|
role: leader
|
||||||
type: ClusterIP
|
type: ClusterIP
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||||
kind: LeaderWorkerSet
|
kind: LeaderWorkerSet
|
||||||
metadata:
|
metadata:
|
||||||
name: vllm
|
name: {{ lws_name | default("vllm") }}
|
||||||
namespace: vllm-project
|
namespace: vllm-project
|
||||||
spec:
|
spec:
|
||||||
replicas: {{ replicas | default(1) }}
|
replicas: {{ replicas | default(1) }}
|
||||||
@@ -128,7 +128,7 @@ spec:
|
|||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: Service
|
||||||
metadata:
|
metadata:
|
||||||
name: vllm-leader
|
name: {{ lws_name | default("vllm") }}-leader
|
||||||
namespace: vllm-project
|
namespace: vllm-project
|
||||||
spec:
|
spec:
|
||||||
ports:
|
ports:
|
||||||
@@ -137,6 +137,6 @@ spec:
|
|||||||
protocol: TCP
|
protocol: TCP
|
||||||
targetPort: 8080
|
targetPort: 8080
|
||||||
selector:
|
selector:
|
||||||
leaderworkerset.sigs.k8s.io/name: vllm
|
leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
|
||||||
role: leader
|
role: leader
|
||||||
type: ClusterIP
|
type: ClusterIP
|
||||||
Reference in New Issue
Block a user