[CI] Upgrade nightly multi-node-tests max-parallel to 2 (#7035)

### What this PR does / why we need it?

1. Increase nightly multi-node test max-parallel from 1 to 2, and fix
resource conflicts that arise when tests run concurrently.
2. Fix parse-trigger job: Add an if condition so it only runs on
schedule, workflow_dispatch, or PRs labeled nightly-test
3. Adjust nightly schedule: Shift trigger time from 24:00 to 23:45
(UTC+8)

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.16.0
- vLLM main:
4034c3d32e

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
zhangxinyuehfad
2026-03-10 16:25:51 +08:00
committed by GitHub
parent 5df450bca4
commit 67d40f23fd
5 changed files with 71 additions and 23 deletions

View File

@@ -66,7 +66,7 @@ defaults:
# only cancel in-progress runs of the same workflow # only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type # and ignore the lint / 8 cards test type
concurrency: concurrency:
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }} group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
@@ -80,7 +80,6 @@ jobs:
env: env:
KUBECONFIG: /tmp/kubeconfig KUBECONFIG: /tmp/kubeconfig
NAMESPACE: vllm-project NAMESPACE: vllm-project
LEADER_POD: vllm-0
steps: steps:
- name: Decode kubeconfig from secrets - name: Decode kubeconfig from secrets
run: | run: |
@@ -101,6 +100,17 @@ jobs:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v6 uses: actions/checkout@v6
- name: Set job variables
run: |
# Derive a unique, valid k8s resource name from config_file_path.
# Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars.
config_file="${{ inputs.config_file_path }}"
lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
LWS_NAME="vllm-${lws_suffix}"
echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV
echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV
echo "Computed LWS_NAME=${LWS_NAME}"
- name: Prepare scripts - name: Prepare scripts
run: | run: |
# prepare for lws entrypoint scripts # prepare for lws entrypoint scripts
@@ -110,14 +120,14 @@ jobs:
run: | run: |
set -euo pipefail set -euo pipefail
CRD_NAME="${CRD_NAME:-vllm}"
TIMEOUT=${TIMEOUT:-120} TIMEOUT=${TIMEOUT:-120}
SLEEP_INTERVAL=2 SLEEP_INTERVAL=2
echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..." echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found
kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found
echo "Waiting for all pods starting with 'vllm' to be deleted..." echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
START_TIME=$(date +%s) START_TIME=$(date +%s)
while true; do while true; do
@@ -126,14 +136,14 @@ jobs:
if [[ $ELAPSED -ge $TIMEOUT ]]; then if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached ($TIMEOUT seconds), some pods still exist:" echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
exit 1 exit 1
fi fi
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true) PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
if [[ -z "$PODS_EXIST" ]]; then if [[ -z "$PODS_EXIST" ]]; then
echo "All vllm pods deleted." echo "All pods for [$LWS_NAME] deleted."
break break
else else
echo "Waiting for pods to be deleted: $PODS_EXIST" echo "Waiting for pods to be deleted: $PODS_EXIST"
@@ -174,6 +184,7 @@ jobs:
fi fi
jinja2 $TEMPLATE_FILE \ jinja2 $TEMPLATE_FILE \
-D lws_name="$LWS_NAME" \
-D size="$size" \ -D size="$size" \
-D replicas="$replicas" \ -D replicas="$replicas" \
-D image="$image" \ -D image="$image" \
@@ -190,7 +201,7 @@ jobs:
- name: Waiting for pod ready - name: Waiting for pod ready
run: | run: |
POD_PREFIX="${POD_PREFIX:-vllm-0}" POD_PREFIX="${LWS_NAME}-0"
SIZE="${{ inputs.size }}" SIZE="${{ inputs.size }}"
TIMEOUT=1200 # default timeout 20 minutes TIMEOUT=1200 # default timeout 20 minutes
@@ -260,7 +271,7 @@ jobs:
trap cleanup EXIT trap cleanup EXIT
for i in $(seq 1 $((size - 1))); do for i in $(seq 1 $((size - 1))); do
POD="vllm-0-${i}" POD="${LWS_NAME}-0-${i}"
echo "==== Collecting logs from worker pod: $POD ====" echo "==== Collecting logs from worker pod: $POD ===="
kubectl logs -f "$POD" -n "$NAMESPACE" \ kubectl logs -f "$POD" -n "$NAMESPACE" \
@@ -290,5 +301,34 @@ jobs:
- name: Post process - name: Post process
if: always() if: always()
run: | run: |
echo "Current pod status:"
kubectl get pods -n "$NAMESPACE" --ignore-not-found=true kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
echo "Deleting resources for [$LWS_NAME]..."
kubectl delete -f ./lws.yaml --ignore-not-found=true || true kubectl delete -f ./lws.yaml --ignore-not-found=true || true
echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
TIMEOUT=300
SLEEP_INTERVAL=5
START_TIME=$(date +%s)
while true; do
NOW=$(date +%s)
ELAPSED=$((NOW - START_TIME))
if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
break
fi
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
if [[ -z "$PODS_EXIST" ]]; then
echo "All pods for [$LWS_NAME] have terminated."
break
else
echo "Waiting for pods to terminate: $PODS_EXIST"
sleep $SLEEP_INTERVAL
fi
done

View File

@@ -21,8 +21,8 @@ name: Nightly-A2
on: on:
schedule: schedule:
# Run test at 24:00 Beijing time (UTC+8) # Run test at 23:45 Beijing time (UTC+8)
- cron: "0 16 * * *" - cron: "45 15 * * *"
workflow_dispatch: workflow_dispatch:
pull_request: pull_request:
branches: branches:
@@ -50,6 +50,10 @@ jobs:
parse-trigger: parse-trigger:
name: Parse trigger and determine test scope name: Parse trigger and determine test scope
runs-on: linux-aarch64-a2b3-0 runs-on: linux-aarch64-a2b3-0
if: >-
github.event_name == 'schedule' ||
github.event_name == 'workflow_dispatch' ||
contains(github.event.pull_request.labels.*.name, 'nightly-test')
outputs: outputs:
should_run: ${{ steps.parse.outputs.should_run }} should_run: ${{ steps.parse.outputs.should_run }}
test_filter: ${{ steps.parse.outputs.test_filter }} test_filter: ${{ steps.parse.outputs.test_filter }}
@@ -201,7 +205,7 @@ jobs:
if: always() && needs.parse-trigger.outputs.should_run == 'true' if: always() && needs.parse-trigger.outputs.should_run == 'true'
strategy: strategy:
fail-fast: false fail-fast: false
max-parallel: 1 max-parallel: 2
matrix: matrix:
test_config: test_config:
- name: multi-node-deepseek-dp - name: multi-node-deepseek-dp

View File

@@ -22,8 +22,8 @@ name: Nightly-A3
on: on:
schedule: schedule:
# Run test at 24:00 Beijing time (UTC+8) # Run test at 23:45 Beijing time (UTC+8)
- cron: "0 16 * * *" - cron: "45 15 * * *"
workflow_dispatch: workflow_dispatch:
pull_request: pull_request:
branches: branches:
@@ -50,6 +50,10 @@ jobs:
parse-trigger: parse-trigger:
name: Parse trigger and determine test scope name: Parse trigger and determine test scope
runs-on: linux-aarch64-a2b3-0 runs-on: linux-aarch64-a2b3-0
if: >-
github.event_name == 'schedule' ||
github.event_name == 'workflow_dispatch' ||
contains(github.event.pull_request.labels.*.name, 'nightly-test')
outputs: outputs:
should_run: ${{ steps.parse.outputs.should_run }} should_run: ${{ steps.parse.outputs.should_run }}
test_filter: ${{ steps.parse.outputs.test_filter }} test_filter: ${{ steps.parse.outputs.test_filter }}
@@ -127,7 +131,7 @@ jobs:
if: always() && needs.parse-trigger.outputs.should_run == 'true' if: always() && needs.parse-trigger.outputs.should_run == 'true'
strategy: strategy:
fail-fast: false fail-fast: false
max-parallel: 1 max-parallel: 2
matrix: matrix:
test_config: test_config:
- name: multi-node-deepseek-pd - name: multi-node-deepseek-pd

View File

@@ -1,7 +1,7 @@
apiVersion: leaderworkerset.x-k8s.io/v1 apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet kind: LeaderWorkerSet
metadata: metadata:
name: vllm name: {{ lws_name | default("vllm") }}
namespace: vllm-project namespace: vllm-project
spec: spec:
replicas: {{ replicas | default(1) }} replicas: {{ replicas | default(1) }}
@@ -128,7 +128,7 @@ spec:
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
metadata: metadata:
name: vllm-leader name: {{ lws_name | default("vllm") }}-leader
namespace: vllm-project namespace: vllm-project
spec: spec:
ports: ports:
@@ -137,6 +137,6 @@ spec:
protocol: TCP protocol: TCP
targetPort: 8080 targetPort: 8080
selector: selector:
leaderworkerset.sigs.k8s.io/name: vllm leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
role: leader role: leader
type: ClusterIP type: ClusterIP

View File

@@ -1,7 +1,7 @@
apiVersion: leaderworkerset.x-k8s.io/v1 apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet kind: LeaderWorkerSet
metadata: metadata:
name: vllm name: {{ lws_name | default("vllm") }}
namespace: vllm-project namespace: vllm-project
spec: spec:
replicas: {{ replicas | default(1) }} replicas: {{ replicas | default(1) }}
@@ -128,7 +128,7 @@ spec:
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
metadata: metadata:
name: vllm-leader name: {{ lws_name | default("vllm") }}-leader
namespace: vllm-project namespace: vllm-project
spec: spec:
ports: ports:
@@ -137,6 +137,6 @@ spec:
protocol: TCP protocol: TCP
targetPort: 8080 targetPort: 8080
selector: selector:
leaderworkerset.sigs.k8s.io/name: vllm leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
role: leader role: leader
type: ClusterIP type: ClusterIP