[CI] Optimize nightly CI (#3858)

### What this PR does / why we need it?
This patch optimizes the nightly CI:
1. Fix the ais_bench error caused by `repo_type` being None
2. Fix the kubectl installation error on A2 runners with ARM architecture
3. Fix the error where the multi_node CI was unable to determine whether the
job succeeded
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
83f478bb19

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-10-29 22:30:19 +08:00
committed by GitHub
parent cba69e117e
commit 4a2ab13743
8 changed files with 110 additions and 39 deletions

View File

@@ -65,6 +65,7 @@ concurrency:
jobs: jobs:
e2e: e2e:
name: ${{ inputs.config_file_path }}
# This is a runner with no NPU for k8s controller # This is a runner with no NPU for k8s controller
runs-on: ${{ inputs.runner }} runs-on: ${{ inputs.runner }}
container: container:
@@ -112,9 +113,10 @@ jobs:
# prepare for lws entrypoint scripts # prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
- name: Clear result ret - name: Clear resources
run: | run: |
rm -f $RESULT_FILE # pre clear the crd resources created by lws
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
- name: Launch cluster - name: Launch cluster
run: | run: |
@@ -153,6 +155,8 @@ jobs:
-D vllm_ascend_ref="$vllm_ascend_ref" \ -D vllm_ascend_ref="$vllm_ascend_ref" \
-D result_file_path="$result_file_path" \ -D result_file_path="$result_file_path" \
-D npu_per_node="$npu_per_node" \ -D npu_per_node="$npu_per_node" \
-D controller_name="$HOSTNAME" \
-D kb_secret=${{ secrets.KUBECONFIG_B64 }} \
--outfile lws.yaml --outfile lws.yaml
kubectl apply -f ./lws.yaml kubectl apply -f ./lws.yaml
@@ -178,29 +182,6 @@ jobs:
run: | run: |
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
- name: Determine is success
run: |
TIMEOUT=300
ELAPSED=0
while [ ! -f "$RESULT_FILE" ]; do
sleep 5
ELAPSED=$((ELAPSED + 5))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo "Timeout waiting for test result file"
exit 1
fi
done
RET=$(cat "$RESULT_FILE")
echo "Test result: $RET"
if [ "$RET" -ne 0 ]; then
echo "Test failed"
exit 1
else
echo "Test succeeded"
fi
- name: Post process - name: Post process
if: always() if: always()
run: | run: |

View File

@@ -49,12 +49,10 @@ concurrency:
jobs: jobs:
e2e-nightly: e2e-nightly:
name: e2e-nightly name: ${{ inputs.tests }}
runs-on: ${{ inputs.runner }} runs-on: ${{ inputs.runner }}
container: container:
image: ${{ inputs.image }} image: ${{ inputs.image }}
env:
VLLM_USE_MODELSCOPE: True
steps: steps:
- name: Check npu and CANN info - name: Check npu and CANN info
run: | run: |
@@ -111,5 +109,4 @@ jobs:
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
VLLM_CI_RUNNER: ${{ inputs.runner }} VLLM_CI_RUNNER: ${{ inputs.runner }}
run: | run: |
# TODO: enable more tests
pytest -sv ${{ inputs.tests }} pytest -sv ${{ inputs.tests }}

View File

@@ -0,0 +1,57 @@
name: 'resource clear'
on:
workflow_call:
inputs:
runner:
required: false
type: string
default: linux-aarch64-a3-0
secrets:
KUBECONFIG_B64:
required: true
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
jobs:
resource_clear:
# This is a runner with no NPU for k8s controller
runs-on: ${{ inputs.runner }}
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project
LEADER_POD: vllm-0
RESULT_FILE: /root/.cache/tests/ret/test_result.txt
steps:
- name: Install kubectl
run: |
# Install kubectl
arch=$(uname -m)
if echo "$arch" | grep -qiE "arm|aarch64"; then
echo "Detected ARM architecture: $arch"
KUBECTL="$KUBECTL"_arm
fi
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# Verify kubectl installation
kubectl version --client=true
- name: Decode kubeconfig from secrets
run: |
# Decode and save kubeconfig
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
- name: Clear LWS resources
if: always()
run: |
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found

View File

@@ -101,6 +101,12 @@ jobs:
- name: multi-node-dpsk-4node-pd - name: multi-node-dpsk-4node-pd
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
size: 4 size: 4
- name: multi-node-qwenw8a8-2node
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml
size: 2
- name: multi-node-glm-2node
config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with: with:
soc_version: a3 soc_version: a3
@@ -111,3 +117,12 @@ jobs:
config_file_path: ${{ matrix.test_config.config_file_path }} config_file_path: ${{ matrix.test_config.config_file_path }}
secrets: secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
clear_resources:
needs: multi-node-tests
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
uses: ./.github/workflows/_kill_lws_resources.yaml
with:
runner: linux-aarch64-a3-0
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}

View File

@@ -15,7 +15,7 @@ from tests.e2e.nightly.multi_node.config.utils import (get_avaliable_port,
setup_logger() setup_logger()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py" DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py"
DISAGGEGATED_PREFILL_PORT = 5333 DISAGGEGATED_PREFILL_PORT = 5333

View File

@@ -7,7 +7,7 @@ spec:
replicas: {{ replicas | default(1) }} replicas: {{ replicas | default(1) }}
leaderWorkerTemplate: leaderWorkerTemplate:
size: {{ size | default(2) }} size: {{ size | default(2) }}
restartPolicy: RecreateGroupOnPodRestart restartPolicy: None
leaderTemplate: leaderTemplate:
metadata: metadata:
labels: labels:
@@ -30,6 +30,10 @@ spec:
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH - name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }} value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
- name: CONTROLLER_NAME
value: {{ controller_name | default("placeholder") }}
- name: SECRET
value: {{ kb_secret | default("placeholder") }}
command: command:
- sh - sh
- -c - -c

View File

@@ -162,14 +162,31 @@ kill_npu_processes() {
} }
run_tests() { run_tests() {
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py set +e
kill_npu_processes kill_npu_processes
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
ret=$? ret=$?
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
mkdir -p "$(dirname "$RESULT_FILE_PATH")" if [ $ret -eq 0 ]; then
echo $ret > "$RESULT_FILE_PATH" print_success "All tests passed!"
else
print_error "Some tests failed!"
kubectl delete pod $CONTROLLER_NAME -n vllm-project
fi
fi fi
return $ret set -e
}
install_kubectl() {
arch=$(uname -m)
KUBECTL=/root/.cache/.kube/kubectl
if echo "$arch" | grep -qiE "arm|aarch64"; then
echo "Detected ARM architecture: $arch"
KUBECTL="$KUBECTL"_arm
fi
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
echo "$SECRET" | base64 -d > /tmp/kubeconfig
export KUBECONFIG=/tmp/kubeconfig
} }
main() { main() {
@@ -177,6 +194,7 @@ main() {
check_and_config check_and_config
checkout_src checkout_src
install_sys_dependencies install_sys_dependencies
install_kubectl
install_vllm install_vllm
install_ais_bench install_ais_bench
# to speed up mooncake build process, install Go here # to speed up mooncake build process, install Go here

View File

@@ -284,12 +284,12 @@ def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
def maybe_download_from_modelscope( def maybe_download_from_modelscope(
model: str, model: str,
repo_type: str | None = None, repo_type: str = "model",
revision: str | None = None, revision: str | None = None,
download_dir: str | None = None, download_dir: str | None = None,
ignore_patterns: str | list[str] | None = None, ignore_patterns: str | list[str] | None = None,
allow_patterns: list[str] | str | None = None, allow_patterns: list[str] | str | None = None,
) -> str | None: ) -> str:
""" """
Download model/dataset from ModelScope hub. Download model/dataset from ModelScope hub.
Returns the path to the downloaded model, or None if the model is not Returns the path to the downloaded model, or None if the model is not
@@ -310,5 +310,4 @@ def maybe_download_from_modelscope(
) )
else: else:
model_path = model model_path = model
return model_path return model_path
return None