[CI] Optimize nightly CI (#3858)

### What this PR does / why we need it?
This patch optimizes the nightly CI:
1. Fix the ais_bench "get None repo_type" error
2. Fix the kubectl installation error on A2 with the ARM architecture
3. Fix the multi_node CI being unable to determine whether the job
succeeded
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
83f478bb19

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-10-29 22:30:19 +08:00
committed by GitHub
parent cba69e117e
commit 4a2ab13743
8 changed files with 110 additions and 39 deletions

View File

@@ -15,7 +15,7 @@ from tests.e2e.nightly.multi_node.config.utils import (get_avaliable_port,
# Module-level setup: call the imported setup_logger() helper (presumably
# configures logging -- its body is not visible here), then create this
# module's logger.
setup_logger()
logger = logging.getLogger(__name__)
# Repository-relative path of the disaggregated-prefill proxy example script.
# NOTE(review): the constant is assigned twice in this listing. The listing
# looks like a rendered diff without +/- markers, so the first line is probably
# the removed value and the second the added one -- confirm against the file.
# If both lines were really present, the second assignment would win.
DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py"
DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py"
# Port number constant for the disaggregated-prefill setup.
# NOTE(review): the name is spelled "DISAGGEGATED" (missing "R"), unlike the
# constant above; renaming would require updating every reference.
DISAGGEGATED_PREFILL_PORT = 5333

View File

@@ -7,7 +7,7 @@ spec:
replicas: {{ replicas | default(1) }}
leaderWorkerTemplate:
size: {{ size | default(2) }}
restartPolicy: RecreateGroupOnPodRestart
restartPolicy: None
leaderTemplate:
metadata:
labels:
@@ -30,6 +30,10 @@ spec:
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
- name: CONTROLLER_NAME
value: {{ controller_name | default("placeholder") }}
- name: SECRET
value: {{ kb_secret | default("placeholder") }}
command:
- sh
- -c

View File

@@ -162,14 +162,31 @@ kill_npu_processes() {
}
# Run the multi-node nightly pytest suite and report its outcome.
#
# Reads:   LWS_WORKER_INDEX, RESULT_FILE_PATH, CONTROLLER_NAME
# Effects: on the worker whose LWS_WORKER_INDEX is 0, writes the pytest exit
#          status to RESULT_FILE_PATH (creating its directory) and, when the
#          status is non-zero, deletes the pod named by CONTROLLER_NAME in the
#          vllm-project namespace.
# Returns: the exit status of the last pytest invocation.
run_tests() {
# NOTE(review): the same pytest command appears again after kill_npu_processes.
# This listing looks like a rendered diff without +/- markers, so this first
# occurrence is probably the removed line -- confirm against the actual file.
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
# Turn errexit off so a non-zero pytest status can be captured in $ret below
# instead of aborting the script.
set +e
kill_npu_processes
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
ret=$?
# Only worker index 0 records the result and reacts to a failure.
# NOTE(review): `-eq` is a numeric test; it prints an error and evaluates false
# if LWS_WORKER_INDEX is unset or not an integer.
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
mkdir -p "$(dirname "$RESULT_FILE_PATH")"
echo $ret > "$RESULT_FILE_PATH"
if [ $ret -eq 0 ]; then
print_success "All tests passed!"
else
print_error "Some tests failed!"
# Delete the controller pod on failure (requires kubectl and a kubeconfig).
kubectl delete pod $CONTROLLER_NAME -n vllm-project
fi
fi
return $ret
# NOTE(review): this line follows `return`, so it never runs and errexit stays
# disabled in the calling shell after run_tests finishes.
set -e
}
# Install a kubectl binary from the local cache and set up a kubeconfig.
#
# Reads:   SECRET (base64-encoded text that is decoded into the kubeconfig)
# Effects: copies /root/.cache/.kube/kubectl (or kubectl_arm on ARM hosts) to
#          /usr/local/bin/kubectl, writes /tmp/kubeconfig, exports KUBECONFIG.
install_kubectl() {
arch=$(uname -m)
# Default binary path; an "_arm" suffix is appended below for ARM machines.
KUBECTL=/root/.cache/.kube/kubectl
# Case-insensitive match on the machine name reported by `uname -m`.
if echo "$arch" | grep -qiE "arm|aarch64"; then
echo "Detected ARM architecture: $arch"
KUBECTL="$KUBECTL"_arm
fi
# Copy the chosen binary into PATH, owned by root with mode 0755.
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# Decode SECRET into a kubeconfig file and point kubectl at it.
# NOTE(review): /tmp/kubeconfig is created with the default umask -- confirm
# that its permissions are acceptable for credential material.
echo "$SECRET" | base64 -d > /tmp/kubeconfig
export KUBECONFIG=/tmp/kubeconfig
}
main() {
@@ -177,6 +194,7 @@ main() {
check_and_config
checkout_src
install_sys_dependencies
install_kubectl
install_vllm
install_ais_bench
# to speed up mooncake build process, install Go here