[CI] Optimize nightly CI (#3858)

### What this PR does / why we need it?
This patch optimizes the nightly CI:
1. Fix the ais_bench error of getting a None repo_type
2. Fix the kubectl installation error on A2 with the ARM architecture
3. Fix the multi_node CI being unable to determine whether the job
succeeded
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
83f478bb19

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-10-29 22:30:19 +08:00
committed by GitHub
parent cba69e117e
commit 4a2ab13743
8 changed files with 110 additions and 39 deletions

View File

@@ -65,6 +65,7 @@ concurrency:
jobs:
e2e:
name: ${{ inputs.config_file_path }}
# This is a runner with no NPU for k8s controller
runs-on: ${{ inputs.runner }}
container:
@@ -112,9 +113,10 @@ jobs:
# prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
- name: Clear result ret
- name: Clear resources
run: |
rm -f $RESULT_FILE
# pre clear the crd resources created by lws
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
- name: Launch cluster
run: |
@@ -153,6 +155,8 @@ jobs:
-D vllm_ascend_ref="$vllm_ascend_ref" \
-D result_file_path="$result_file_path" \
-D npu_per_node="$npu_per_node" \
-D controller_name="$HOSTNAME" \
-D kb_secret=${{ secrets.KUBECONFIG_B64 }} \
--outfile lws.yaml
kubectl apply -f ./lws.yaml
@@ -178,29 +182,6 @@ jobs:
run: |
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
- name: Determine is success
run: |
TIMEOUT=300
ELAPSED=0
while [ ! -f "$RESULT_FILE" ]; do
sleep 5
ELAPSED=$((ELAPSED + 5))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo "Timeout waiting for test result file"
exit 1
fi
done
RET=$(cat "$RESULT_FILE")
echo "Test result: $RET"
if [ "$RET" -ne 0 ]; then
echo "Test failed"
exit 1
else
echo "Test succeeded"
fi
- name: Post process
if: always()
run: |

View File

@@ -49,12 +49,10 @@ concurrency:
jobs:
e2e-nightly:
name: e2e-nightly
name: ${{ inputs.tests }}
runs-on: ${{ inputs.runner }}
container:
image: ${{ inputs.image }}
env:
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
@@ -111,5 +109,4 @@ jobs:
VLLM_USE_MODELSCOPE: True
VLLM_CI_RUNNER: ${{ inputs.runner }}
run: |
# TODO: enable more tests
pytest -sv ${{ inputs.tests }}

View File

@@ -0,0 +1,57 @@
# Reusable workflow (triggered via workflow_call) that deletes the `vllm`
# LeaderWorkerSet from the target namespace.
name: 'resource clear'

on:
  workflow_call:
    inputs:
      # Label of the runner the cleanup job is scheduled on.
      runner:
        required: false
        type: string
        default: linux-aarch64-a3-0
    secrets:
      # Base64-encoded kubeconfig; decoded to $KUBECONFIG by the job below.
      KUBECONFIG_B64:
        required: true

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

jobs:
  resource_clear:
    # This is a runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
      # NOTE(review): indentation was not recoverable when this block was
      # reformatted; `env` is assumed to belong to `container` - confirm.
      env:
        # Where the decoded kubeconfig is written; kubectl reads this variable.
        KUBECONFIG: /tmp/kubeconfig
        # Path of the kubectl binary to install ("_arm" is appended on ARM hosts).
        KUBECTL: /root/.cache/.kube/kubectl
        NAMESPACE: vllm-project
        # LEADER_POD and RESULT_FILE are not referenced by the steps below.
        LEADER_POD: vllm-0
        RESULT_FILE: /root/.cache/tests/ret/test_result.txt
    steps:
      - name: Install kubectl
        run: |
          # Install kubectl
          arch=$(uname -m)
          if echo "$arch" | grep -qiE "arm|aarch64"; then
            echo "Detected ARM architecture: $arch"
            # Switch to the ARM build of the binary.
            KUBECTL="${KUBECTL}_arm"
          fi
          install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl
          # Verify kubectl installation
          kubectl version --client=true

      - name: Decode kubeconfig from secrets
        env:
          # Hand the secret to the script through the environment instead of
          # expanding it into the script text.
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
        run: |
          # Decode and save kubeconfig
          echo "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG"

      - name: Clear LWS resources
        # Run the cleanup even when an earlier step failed.
        if: always()
        run: |
          kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found

View File

@@ -101,6 +101,12 @@ jobs:
- name: multi-node-dpsk-4node-pd
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
size: 4
- name: multi-node-qwenw8a8-2node
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml
size: 2
- name: multi-node-glm-2node
config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
@@ -111,3 +117,12 @@ jobs:
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
clear_resources:
needs: multi-node-tests
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
uses: ./.github/workflows/_kill_lws_resources.yaml
with:
runner: linux-aarch64-a3-0
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}