[CI] Optimize nightly CI (#3858)
### What this PR does / why we need it?
This patch optimizes the nightly CI:
1. Fix the ais_bench error caused by repo_type being None
2. Fix the kubectl installation error on A2 with the ARM architecture
3. Fix the multi_node CI being unable to determine whether the job
succeeded
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0rc3
- vLLM main:
83f478bb19
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
31
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
31
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
@@ -65,6 +65,7 @@ concurrency:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
e2e:
|
e2e:
|
||||||
|
name: ${{ inputs.config_file_path }}
|
||||||
# This is a runner with no NPU for k8s controller
|
# This is a runner with no NPU for k8s controller
|
||||||
runs-on: ${{ inputs.runner }}
|
runs-on: ${{ inputs.runner }}
|
||||||
container:
|
container:
|
||||||
@@ -112,9 +113,10 @@ jobs:
|
|||||||
# prepare for lws entrypoint scripts
|
# prepare for lws entrypoint scripts
|
||||||
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
|
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
|
||||||
|
|
||||||
- name: Clear result ret
|
- name: Clear resources
|
||||||
run: |
|
run: |
|
||||||
rm -f $RESULT_FILE
|
# pre clear the crd resources created by lws
|
||||||
|
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
|
||||||
|
|
||||||
- name: Launch cluster
|
- name: Launch cluster
|
||||||
run: |
|
run: |
|
||||||
@@ -153,6 +155,8 @@ jobs:
|
|||||||
-D vllm_ascend_ref="$vllm_ascend_ref" \
|
-D vllm_ascend_ref="$vllm_ascend_ref" \
|
||||||
-D result_file_path="$result_file_path" \
|
-D result_file_path="$result_file_path" \
|
||||||
-D npu_per_node="$npu_per_node" \
|
-D npu_per_node="$npu_per_node" \
|
||||||
|
-D controller_name="$HOSTNAME" \
|
||||||
|
-D kb_secret=${{ secrets.KUBECONFIG_B64 }} \
|
||||||
--outfile lws.yaml
|
--outfile lws.yaml
|
||||||
|
|
||||||
kubectl apply -f ./lws.yaml
|
kubectl apply -f ./lws.yaml
|
||||||
@@ -178,29 +182,6 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
|
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
|
||||||
|
|
||||||
- name: Determine is success
|
|
||||||
run: |
|
|
||||||
TIMEOUT=300
|
|
||||||
ELAPSED=0
|
|
||||||
while [ ! -f "$RESULT_FILE" ]; do
|
|
||||||
sleep 5
|
|
||||||
ELAPSED=$((ELAPSED + 5))
|
|
||||||
if [ $ELAPSED -ge $TIMEOUT ]; then
|
|
||||||
echo "Timeout waiting for test result file"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
RET=$(cat "$RESULT_FILE")
|
|
||||||
echo "Test result: $RET"
|
|
||||||
|
|
||||||
if [ "$RET" -ne 0 ]; then
|
|
||||||
echo "Test failed"
|
|
||||||
exit 1
|
|
||||||
else
|
|
||||||
echo "Test succeeded"
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Post process
|
- name: Post process
|
||||||
if: always()
|
if: always()
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -49,12 +49,10 @@ concurrency:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
e2e-nightly:
|
e2e-nightly:
|
||||||
name: e2e-nightly
|
name: ${{ inputs.tests }}
|
||||||
runs-on: ${{ inputs.runner }}
|
runs-on: ${{ inputs.runner }}
|
||||||
container:
|
container:
|
||||||
image: ${{ inputs.image }}
|
image: ${{ inputs.image }}
|
||||||
env:
|
|
||||||
VLLM_USE_MODELSCOPE: True
|
|
||||||
steps:
|
steps:
|
||||||
- name: Check npu and CANN info
|
- name: Check npu and CANN info
|
||||||
run: |
|
run: |
|
||||||
@@ -111,5 +109,4 @@ jobs:
|
|||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
VLLM_CI_RUNNER: ${{ inputs.runner }}
|
VLLM_CI_RUNNER: ${{ inputs.runner }}
|
||||||
run: |
|
run: |
|
||||||
# TODO: enable more tests
|
|
||||||
pytest -sv ${{ inputs.tests }}
|
pytest -sv ${{ inputs.tests }}
|
||||||
|
|||||||
57
.github/workflows/_kill_lws_resources.yaml
vendored
Normal file
57
.github/workflows/_kill_lws_resources.yaml
vendored
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
name: 'resource clear'
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_call:
|
||||||
|
inputs:
|
||||||
|
runner:
|
||||||
|
required: false
|
||||||
|
type: string
|
||||||
|
default: linux-aarch64-a3-0
|
||||||
|
secrets:
|
||||||
|
KUBECONFIG_B64:
|
||||||
|
required: true
|
||||||
|
|
||||||
|
|
||||||
|
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
||||||
|
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
|
||||||
|
# It's used to activate ascend-toolkit environment variables.
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash -el {0}
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
resource_clear:
|
||||||
|
# This is a runner with no NPU for k8s controller
|
||||||
|
runs-on: ${{ inputs.runner }}
|
||||||
|
container:
|
||||||
|
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
||||||
|
env:
|
||||||
|
KUBECONFIG: /tmp/kubeconfig
|
||||||
|
KUBECTL: /root/.cache/.kube/kubectl
|
||||||
|
NAMESPACE: vllm-project
|
||||||
|
LEADER_POD: vllm-0
|
||||||
|
RESULT_FILE: /root/.cache/tests/ret/test_result.txt
|
||||||
|
steps:
|
||||||
|
- name: Install kubectl
|
||||||
|
run: |
|
||||||
|
# Install kubectl
|
||||||
|
arch=$(uname -m)
|
||||||
|
|
||||||
|
if echo "$arch" | grep -qiE "arm|aarch64"; then
|
||||||
|
echo "Detected ARM architecture: $arch"
|
||||||
|
KUBECTL="$KUBECTL"_arm
|
||||||
|
fi
|
||||||
|
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
|
||||||
|
|
||||||
|
# Verify kubectl installation
|
||||||
|
kubectl version --client=true
|
||||||
|
|
||||||
|
- name: Decode kubeconfig from secrets
|
||||||
|
run: |
|
||||||
|
# Decode and save kubeconfig
|
||||||
|
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
|
||||||
|
|
||||||
|
- name: Clear LWS resources
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
|
||||||
@@ -101,6 +101,12 @@ jobs:
|
|||||||
- name: multi-node-dpsk-4node-pd
|
- name: multi-node-dpsk-4node-pd
|
||||||
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
|
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
|
||||||
size: 4
|
size: 4
|
||||||
|
- name: multi-node-qwenw8a8-2node
|
||||||
|
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml
|
||||||
|
size: 2
|
||||||
|
- name: multi-node-glm-2node
|
||||||
|
config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml
|
||||||
|
size: 2
|
||||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||||
with:
|
with:
|
||||||
soc_version: a3
|
soc_version: a3
|
||||||
@@ -111,3 +117,12 @@ jobs:
|
|||||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
secrets:
|
secrets:
|
||||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
||||||
|
|
||||||
|
clear_resources:
|
||||||
|
needs: multi-node-tests
|
||||||
|
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||||
|
uses: ./.github/workflows/_kill_lws_resources.yaml
|
||||||
|
with:
|
||||||
|
runner: linux-aarch64-a3-0
|
||||||
|
secrets:
|
||||||
|
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ from tests.e2e.nightly.multi_node.config.utils import (get_avaliable_port,
|
|||||||
|
|
||||||
setup_logger()
|
setup_logger()
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py"
|
DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py"
|
||||||
DISAGGEGATED_PREFILL_PORT = 5333
|
DISAGGEGATED_PREFILL_PORT = 5333
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ spec:
|
|||||||
replicas: {{ replicas | default(1) }}
|
replicas: {{ replicas | default(1) }}
|
||||||
leaderWorkerTemplate:
|
leaderWorkerTemplate:
|
||||||
size: {{ size | default(2) }}
|
size: {{ size | default(2) }}
|
||||||
restartPolicy: RecreateGroupOnPodRestart
|
restartPolicy: None
|
||||||
leaderTemplate:
|
leaderTemplate:
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
@@ -30,6 +30,10 @@ spec:
|
|||||||
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
|
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
|
||||||
- name: RESULT_FILE_PATH
|
- name: RESULT_FILE_PATH
|
||||||
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
|
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
|
||||||
|
- name: CONTROLLER_NAME
|
||||||
|
value: {{ controller_name | default("placeholder") }}
|
||||||
|
- name: SECRET
|
||||||
|
value: {{ kb_secret | default("placeholder") }}
|
||||||
command:
|
command:
|
||||||
- sh
|
- sh
|
||||||
- -c
|
- -c
|
||||||
|
|||||||
@@ -162,14 +162,31 @@ kill_npu_processes() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_tests() {
|
run_tests() {
|
||||||
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
|
set +e
|
||||||
kill_npu_processes
|
kill_npu_processes
|
||||||
|
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
|
||||||
ret=$?
|
ret=$?
|
||||||
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
|
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
|
||||||
mkdir -p "$(dirname "$RESULT_FILE_PATH")"
|
if [ $ret -eq 0 ]; then
|
||||||
echo $ret > "$RESULT_FILE_PATH"
|
print_success "All tests passed!"
|
||||||
|
else
|
||||||
|
print_error "Some tests failed!"
|
||||||
|
kubectl delete pod $CONTROLLER_NAME -n vllm-project
|
||||||
fi
|
fi
|
||||||
return $ret
|
fi
|
||||||
|
set -e
|
||||||
|
}
|
||||||
|
|
||||||
|
install_kubectl() {
|
||||||
|
arch=$(uname -m)
|
||||||
|
KUBECTL=/root/.cache/.kube/kubectl
|
||||||
|
if echo "$arch" | grep -qiE "arm|aarch64"; then
|
||||||
|
echo "Detected ARM architecture: $arch"
|
||||||
|
KUBECTL="$KUBECTL"_arm
|
||||||
|
fi
|
||||||
|
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
|
||||||
|
echo "$SECRET" | base64 -d > /tmp/kubeconfig
|
||||||
|
export KUBECONFIG=/tmp/kubeconfig
|
||||||
}
|
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
@@ -177,6 +194,7 @@ main() {
|
|||||||
check_and_config
|
check_and_config
|
||||||
checkout_src
|
checkout_src
|
||||||
install_sys_dependencies
|
install_sys_dependencies
|
||||||
|
install_kubectl
|
||||||
install_vllm
|
install_vllm
|
||||||
install_ais_bench
|
install_ais_bench
|
||||||
# to speed up mooncake build process, install Go here
|
# to speed up mooncake build process, install Go here
|
||||||
|
|||||||
@@ -284,12 +284,12 @@ def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
|
|||||||
|
|
||||||
def maybe_download_from_modelscope(
|
def maybe_download_from_modelscope(
|
||||||
model: str,
|
model: str,
|
||||||
repo_type: str | None = None,
|
repo_type: str = "model",
|
||||||
revision: str | None = None,
|
revision: str | None = None,
|
||||||
download_dir: str | None = None,
|
download_dir: str | None = None,
|
||||||
ignore_patterns: str | list[str] | None = None,
|
ignore_patterns: str | list[str] | None = None,
|
||||||
allow_patterns: list[str] | str | None = None,
|
allow_patterns: list[str] | str | None = None,
|
||||||
) -> str | None:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Download model/dataset from ModelScope hub.
|
Download model/dataset from ModelScope hub.
|
||||||
Returns the path to the downloaded model, or None if the model is not
|
Returns the path to the downloaded model, or None if the model is not
|
||||||
@@ -311,4 +311,3 @@ def maybe_download_from_modelscope(
|
|||||||
else:
|
else:
|
||||||
model_path = model
|
model_path = model
|
||||||
return model_path
|
return model_path
|
||||||
return None
|
|
||||||
|
|||||||
Reference in New Issue
Block a user