[CI] Add daily images build for nightly ci (#3989)
### What this PR does / why we need it?
Our nightly CI currently takes an excessively long time to build, so I
recommend pre-installing the necessary packages, at confirmed versions,
in the Docker image to reduce the time required for integration testing.
This includes Mooncake and vLLM with fixed tags. The change is expected
to reduce nightly-ci duration by 2 hours.
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
95
.github/workflows/_e2e_nightly_single_node.yaml
vendored
95
.github/workflows/_e2e_nightly_single_node.yaml
vendored
@@ -62,67 +62,56 @@ jobs:
|
||||
npu-smi info
|
||||
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
|
||||
|
||||
- name: Config mirrors
|
||||
- name: Show vLLM and vLLM-Ascend version
|
||||
working-directory: /vllm-workspace
|
||||
run: |
|
||||
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
apt-get update -y
|
||||
apt install git -y
|
||||
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
||||
echo "Installed vLLM-related Python packages:"
|
||||
pip list | grep vllm || echo "No vllm packages found."
|
||||
|
||||
- name: Checkout vllm-project/vllm-ascend repo
|
||||
uses: actions/checkout@v4
|
||||
echo ""
|
||||
echo "============================"
|
||||
echo "vLLM Git information"
|
||||
echo "============================"
|
||||
cd vllm
|
||||
if [ -d .git ]; then
|
||||
echo "Branch: $(git rev-parse --abbrev-ref HEAD)"
|
||||
echo "Commit hash: $(git rev-parse HEAD)"
|
||||
echo "Author: $(git log -1 --pretty=format:'%an <%ae>')"
|
||||
echo "Date: $(git log -1 --pretty=format:'%ad' --date=iso)"
|
||||
echo "Message: $(git log -1 --pretty=format:'%s')"
|
||||
echo "Tags: $(git tag --points-at HEAD || echo 'None')"
|
||||
echo "Remote: $(git remote -v | head -n1)"
|
||||
echo ""
|
||||
else
|
||||
echo "No .git directory found in vllm"
|
||||
fi
|
||||
cd ..
|
||||
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
apt-get -y install `cat packages.txt`
|
||||
apt-get -y install gcc g++ cmake libnuma-dev
|
||||
|
||||
- name: Checkout vllm-project/vllm repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
repository: vllm-project/vllm
|
||||
ref: ${{ inputs.vllm }}
|
||||
path: ./vllm-empty
|
||||
|
||||
- name: Install vllm-project/vllm from source
|
||||
working-directory: ./vllm-empty
|
||||
run: |
|
||||
VLLM_TARGET_DEVICE=empty pip install -e .
|
||||
|
||||
- name: Install vllm-project/vllm-ascend
|
||||
env:
|
||||
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
run: |
|
||||
pip install -r requirements-dev.txt
|
||||
pip install -v -e .
|
||||
|
||||
- name: Install custom-ops (for DeepSeek-V3.2-Exp)
|
||||
if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
|
||||
chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
|
||||
./CANN-custom_ops-sfa-linux.aarch64.run --quiet
|
||||
export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
|
||||
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
|
||||
pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
|
||||
. /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
- name: Checkout aisbench repo and Install aisbench
|
||||
run: |
|
||||
git clone https://gitee.com/aisbench/benchmark.git
|
||||
cd benchmark
|
||||
git checkout v3.0-20250930-master
|
||||
pip3 install -e ./
|
||||
pip3 install -r requirements/api.txt
|
||||
pip3 install -r requirements/extra.txt
|
||||
echo ""
|
||||
echo "============================"
|
||||
echo "vLLM-Ascend Git information"
|
||||
echo "============================"
|
||||
cd vllm-ascend
|
||||
if [ -d .git ]; then
|
||||
echo "Branch: $(git rev-parse --abbrev-ref HEAD)"
|
||||
echo "Commit hash: $(git rev-parse HEAD)"
|
||||
echo "Author: $(git log -1 --pretty=format:'%an <%ae>')"
|
||||
echo "Date: $(git log -1 --pretty=format:'%ad' --date=iso)"
|
||||
echo "Message: $(git log -1 --pretty=format:'%s')"
|
||||
echo "Tags: $(git tag --points-at HEAD || echo 'None')"
|
||||
echo "Remote: $(git remote -v | head -n1)"
|
||||
echo ""
|
||||
else
|
||||
echo "No .git directory found in vllm-ascend"
|
||||
fi
|
||||
cd ..
|
||||
|
||||
- name: Run vllm-project/vllm-ascend test
|
||||
env:
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
VLLM_USE_MODELSCOPE: True
|
||||
VLLM_CI_RUNNER: ${{ inputs.runner }}
|
||||
BENCHMARK_HOME: /vllm-workspace/vllm-ascend/benchmark
|
||||
working-directory: /vllm-workspace/vllm-ascend
|
||||
run: |
|
||||
pytest -sv ${{ inputs.tests }}
|
||||
|
||||
57
.github/workflows/_kill_lws_resources.yaml
vendored
57
.github/workflows/_kill_lws_resources.yaml
vendored
@@ -1,57 +0,0 @@
|
||||
name: 'resource clear'
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
runner:
|
||||
required: false
|
||||
type: string
|
||||
default: linux-aarch64-a3-0
|
||||
secrets:
|
||||
KUBECONFIG_B64:
|
||||
required: true
|
||||
|
||||
|
||||
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
||||
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
|
||||
# It's used to activate ascend-toolkit environment variables.
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -el {0}
|
||||
|
||||
jobs:
|
||||
resource_clear:
|
||||
# This is a runner with no NPU for k8s controller
|
||||
runs-on: ${{ inputs.runner }}
|
||||
container:
|
||||
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||
env:
|
||||
KUBECONFIG: /tmp/kubeconfig
|
||||
KUBECTL: /root/.cache/.kube/kubectl
|
||||
NAMESPACE: vllm-project
|
||||
LEADER_POD: vllm-0
|
||||
RESULT_FILE: /root/.cache/tests/ret/test_result.txt
|
||||
steps:
|
||||
- name: Install kubectl
|
||||
run: |
|
||||
# Install kubectl
|
||||
arch=$(uname -m)
|
||||
|
||||
if echo "$arch" | grep -qiE "arm|aarch64"; then
|
||||
echo "Detected ARM architecture: $arch"
|
||||
KUBECTL="$KUBECTL"_arm
|
||||
fi
|
||||
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
|
||||
|
||||
# Verify kubectl installation
|
||||
kubectl version --client=true
|
||||
|
||||
- name: Decode kubeconfig from secrets
|
||||
run: |
|
||||
# Decode and save kubeconfig
|
||||
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
|
||||
|
||||
- name: Clear LWS resources
|
||||
if: always()
|
||||
run: |
|
||||
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
|
||||
74
.github/workflows/_nightly_image_build.yaml
vendored
Normal file
74
.github/workflows/_nightly_image_build.yaml
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
name: 'image / nightly / Ubuntu / test'
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 0,4,8,12,14 * * *'
|
||||
workflow_call:
|
||||
inputs:
|
||||
target:
|
||||
required: true
|
||||
type: string
|
||||
description: 'Target architecture, e.g., a2, a3'
|
||||
outputs:
|
||||
image-tag:
|
||||
description: 'The built image tag'
|
||||
value: ${{ jobs.build-and-sync.outputs.image-tag }}
|
||||
secrets:
|
||||
HW_USERNAME:
|
||||
required: true
|
||||
HW_TOKEN:
|
||||
required: true
|
||||
|
||||
# This workflow builds and pushes Docker images for nightly-ci
|
||||
# It is built on top of the quay.io/ascend/vllm-ascend:main image
|
||||
# and adds some customizations for nightly testing, then pushes to Huawei Cloud SWR
|
||||
jobs:
|
||||
build-and-sync:
|
||||
runs-on: ubuntu-22.04-arm
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
target: ${{ fromJson(github.event_name == 'schedule' && '["a2","a3"]' || format('["{0}"]', inputs.target || 'a3')) }}
|
||||
|
||||
outputs:
|
||||
image-tag: ${{ steps.build-image.outputs.image-tag }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Show build target
|
||||
run: |
|
||||
echo "Building target: ${{ matrix.target }}"
|
||||
|
||||
- name: Login to Huawei Cloud SWR
|
||||
id: login-swr
|
||||
if: ${{ env.HW_USERNAME != '' && env.HW_TOKEN != '' }}
|
||||
run: |
|
||||
echo "${{ env.HW_TOKEN }}" | docker login -u "${{ env.HW_USERNAME }}" --password-stdin swr.cn-southwest-2.myhuaweicloud.com
|
||||
env:
|
||||
HW_USERNAME: ${{ secrets.HW_USERNAME }}
|
||||
HW_TOKEN: ${{ secrets.HW_TOKEN }}
|
||||
|
||||
- name: Build image
|
||||
id: build-image
|
||||
run: |
|
||||
TARGET="${{ matrix.target }}"
|
||||
IMAGE_TAG="swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-${TARGET}"
|
||||
|
||||
echo "Building image: $IMAGE_TAG"
|
||||
docker build \
|
||||
--network host \
|
||||
--platform linux/arm64 \
|
||||
-f .github/Dockerfile.nightly.${TARGET} \
|
||||
--build-arg CANN_VERSION="8.3.rc1" \
|
||||
--build-arg UBUNTU_VERSION="22.04" \
|
||||
--build-arg PYTHON_VERSION="3.11" \
|
||||
-t "$IMAGE_TAG" .
|
||||
|
||||
echo "image-tag=$IMAGE_TAG" >> $GITHUB_OUTPUT
|
||||
|
||||
# To avoid pushing images from forks, only push when the repository owner is 'vllm-project'
|
||||
- name: Push image to SWR
|
||||
if: ${{ github.repository_owner == 'vllm-project' && steps.login-swr.conclusion == 'success' }}
|
||||
run: |
|
||||
docker push ${{ steps.build-image.outputs.image-tag }}
|
||||
@@ -42,9 +42,18 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
image_build:
|
||||
name: nightly image build
|
||||
uses: ./.github/workflows/_nightly_image_build.yaml
|
||||
with:
|
||||
target: a2
|
||||
secrets:
|
||||
HW_USERNAME: ${{ secrets.HW_USERNAME }}
|
||||
HW_TOKEN: ${{ secrets.HW_TOKEN }}
|
||||
single-node-tests:
|
||||
name: single-node
|
||||
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
needs: image_build
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -63,10 +72,11 @@ jobs:
|
||||
vllm: v0.11.0
|
||||
runner: ${{ matrix.test_config.os }}
|
||||
tests: ${{ matrix.test_config.tests }}
|
||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
|
||||
|
||||
multi-node-tests:
|
||||
name: multi-node
|
||||
needs: single-node-tests
|
||||
needs: [single-node-tests, image_build]
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -83,7 +93,7 @@ jobs:
|
||||
with:
|
||||
soc_version: a2
|
||||
runner: linux-aarch64-a2-0
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
|
||||
replicas: 1
|
||||
size: ${{ matrix.test_config.size }}
|
||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||
|
||||
@@ -41,9 +41,18 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
image_build:
|
||||
name: nightly image build
|
||||
uses: ./.github/workflows/_nightly_image_build.yaml
|
||||
with:
|
||||
target: a3
|
||||
secrets:
|
||||
HW_USERNAME: ${{ secrets.HW_USERNAME }}
|
||||
HW_TOKEN: ${{ secrets.HW_TOKEN }}
|
||||
single-node-tests:
|
||||
name: single-node
|
||||
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
needs: image_build
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -94,13 +103,13 @@ jobs:
|
||||
with:
|
||||
vllm: v0.11.0
|
||||
runner: ${{ matrix.test_config.os }}
|
||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
|
||||
tests: ${{ matrix.test_config.tests }}
|
||||
name: ${{ matrix.test_config.name }}
|
||||
|
||||
multi-node-tests:
|
||||
name: multi-node
|
||||
needs: single-node-tests
|
||||
needs: [single-node-tests, image_build]
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -129,7 +138,7 @@ jobs:
|
||||
with:
|
||||
soc_version: a3
|
||||
runner: linux-aarch64-a3-0
|
||||
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
|
||||
replicas: 1
|
||||
size: ${{ matrix.test_config.size }}
|
||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||
|
||||
Reference in New Issue
Block a user