[CI] Add daily images build for nightly ci (#3989)

### What this PR does / why we need it?
Our nightly-ci currently takes excessively long to run. To reduce the time
required for integration testing, I recommend pre-installing the necessary
packages at confirmed versions in the Docker image, including Mooncake and
vLLM pinned to fixed tags. This is expected to reduce the nightly-ci duration
by 2 hours.

- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-11-13 20:10:12 +08:00
committed by GitHub
parent f7d1f73b98
commit 7294f89e43
11 changed files with 285 additions and 334 deletions

View File

@@ -62,67 +62,56 @@ jobs:
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
- name: Show vLLM and vLLM-Ascend version
working-directory: /vllm-workspace
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git -y
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
echo "Installed vLLM-related Python packages:"
pip list | grep vllm || echo "No vllm packages found."
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
echo ""
echo "============================"
echo "vLLM Git information"
echo "============================"
cd vllm
if [ -d .git ]; then
echo "Branch: $(git rev-parse --abbrev-ref HEAD)"
echo "Commit hash: $(git rev-parse HEAD)"
echo "Author: $(git log -1 --pretty=format:'%an <%ae>')"
echo "Date: $(git log -1 --pretty=format:'%ad' --date=iso)"
echo "Message: $(git log -1 --pretty=format:'%s')"
echo "Tags: $(git tag --points-at HEAD || echo 'None')"
echo "Remote: $(git remote -v | head -n1)"
echo ""
else
echo "No .git directory found in vllm"
fi
cd ..
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ inputs.vllm }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
- name: Install custom-ops (for DeepSeek-V3.2-Exp)
if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
shell: bash -l {0}
run: |
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
./CANN-custom_ops-sfa-linux.aarch64.run --quiet
export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
. /usr/local/Ascend/ascend-toolkit/set_env.sh
- name: Checkout aisbench repo and Install aisbench
run: |
git clone https://gitee.com/aisbench/benchmark.git
cd benchmark
git checkout v3.0-20250930-master
pip3 install -e ./
pip3 install -r requirements/api.txt
pip3 install -r requirements/extra.txt
echo ""
echo "============================"
echo "vLLM-Ascend Git information"
echo "============================"
cd vllm-ascend
if [ -d .git ]; then
echo "Branch: $(git rev-parse --abbrev-ref HEAD)"
echo "Commit hash: $(git rev-parse HEAD)"
echo "Author: $(git log -1 --pretty=format:'%an <%ae>')"
echo "Date: $(git log -1 --pretty=format:'%ad' --date=iso)"
echo "Message: $(git log -1 --pretty=format:'%s')"
echo "Tags: $(git tag --points-at HEAD || echo 'None')"
echo "Remote: $(git remote -v | head -n1)"
echo ""
else
echo "No .git directory found in vllm-ascend"
fi
cd ..
- name: Run vllm-project/vllm-ascend test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
VLLM_CI_RUNNER: ${{ inputs.runner }}
BENCHMARK_HOME: /vllm-workspace/vllm-ascend/benchmark
working-directory: /vllm-workspace/vllm-ascend
run: |
pytest -sv ${{ inputs.tests }}

View File

@@ -1,57 +0,0 @@
# Reusable workflow (triggered via workflow_call) that deletes the `vllm`
# LeaderWorkerSet from the Kubernetes namespace named by NAMESPACE.
name: 'resource clear'
on:
workflow_call:
inputs:
# Label of the runner the job is scheduled on (consumed by `runs-on` below).
runner:
required: false
type: string
default: linux-aarch64-a3-0
secrets:
# Base64-encoded kubeconfig; the "Decode kubeconfig from secrets" step
# decodes it into the file named by KUBECONFIG.
KUBECONFIG_B64:
required: true
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
jobs:
resource_clear:
# This is a runner with no NPU for k8s controller
runs-on: ${{ inputs.runner }}
container:
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
env:
# KUBECONFIG: destination file for the decoded kubeconfig.
# KUBECTL: path of the kubectl binary that the install step copies to
# /usr/local/bin/kubectl ("_arm" is appended to the path on ARM hosts).
# NAMESPACE: namespace passed to `kubectl delete` in the last step.
# NOTE(review): LEADER_POD and RESULT_FILE are not referenced by any step
# shown in this workflow - confirm whether they are still needed.
KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project
LEADER_POD: vllm-0
RESULT_FILE: /root/.cache/tests/ret/test_result.txt
steps:
# Installs the kubectl binary found at $KUBECTL rather than downloading one.
- name: Install kubectl
run: |
# Install kubectl
arch=$(uname -m)
if echo "$arch" | grep -qiE "arm|aarch64"; then
echo "Detected ARM architecture: $arch"
KUBECTL="$KUBECTL"_arm
fi
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# Verify kubectl installation
kubectl version --client=true
# Writes the decoded secret to $KUBECONFIG so later kubectl calls can use it.
- name: Decode kubeconfig from secrets
run: |
# Decode and save kubeconfig
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
# `if: always()` runs this step even when an earlier step failed;
# `--ignore-not-found` keeps the delete from failing when the resource is absent.
- name: Clear LWS resources
if: always()
run: |
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found

View File

@@ -0,0 +1,74 @@
# Builds the Docker image used by nightly-ci and pushes it to Huawei Cloud SWR.
# The image is based on quay.io/ascend/vllm-ascend:main, with some
# customizations for nightly testing.
#
# Triggers:
#   - schedule:      builds every supported target (a2 and a3).
#   - workflow_call: builds only `inputs.target` and exposes the resulting tag
#                    through the `image-tag` output.
name: 'image / nightly / Ubuntu / test'
on:
  schedule:
    # GitHub evaluates cron in UTC: minute 0 of hours 0, 4, 8, 12 and 14.
    - cron: '0 0,4,8,12,14 * * *'
  workflow_call:
    inputs:
      target:
        required: true
        type: string
        description: 'Target architecture, e.g., a2, a3'
    outputs:
      image-tag:
        description: 'The built image tag'
        value: ${{ jobs.build-and-sync.outputs.image-tag }}
    secrets:
      HW_USERNAME:
        required: true
      HW_TOKEN:
        required: true

jobs:
  build-and-sync:
    runs-on: ubuntu-22.04-arm
    strategy:
      matrix:
        # Select the matrix from `inputs.target`, not from `github.event_name`:
        # inside a reusable workflow the `github` context belongs to the CALLER,
        # so a schedule-triggered caller would otherwise build every target and
        # the job-level `image-tag` output (last matrix leg to finish wins)
        # could name the wrong image. With a single-entry matrix for callers the
        # output is deterministic; a direct scheduled run has no inputs and
        # builds both targets.
        target: ${{ fromJson(inputs.target && format('["{0}"]', inputs.target) || '["a2","a3"]') }}
    outputs:
      image-tag: ${{ steps.build-image.outputs.image-tag }}
    steps:
      - uses: actions/checkout@v4

      - name: Show build target
        env:
          TARGET: ${{ matrix.target }}
        run: |
          echo "Building target: ${TARGET}"

      # Skipped when the credentials are unavailable; the push step below
      # checks this step's conclusion.
      - name: Login to Huawei Cloud SWR
        id: login-swr
        if: ${{ env.HW_USERNAME != '' && env.HW_TOKEN != '' }}
        env:
          HW_USERNAME: ${{ secrets.HW_USERNAME }}
          HW_TOKEN: ${{ secrets.HW_TOKEN }}
        # Read the credentials from the environment instead of interpolating
        # them into the script text.
        run: |
          echo "$HW_TOKEN" | docker login -u "$HW_USERNAME" --password-stdin swr.cn-southwest-2.myhuaweicloud.com

      - name: Build image
        id: build-image
        env:
          TARGET: ${{ matrix.target }}
        run: |
          IMAGE_TAG="swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-${TARGET}"
          echo "Building image: $IMAGE_TAG"
          docker build \
            --network host \
            --platform linux/arm64 \
            -f ".github/Dockerfile.nightly.${TARGET}" \
            --build-arg CANN_VERSION="8.3.rc1" \
            --build-arg UBUNTU_VERSION="22.04" \
            --build-arg PYTHON_VERSION="3.11" \
            -t "$IMAGE_TAG" .
          echo "image-tag=$IMAGE_TAG" >> "$GITHUB_OUTPUT"

      # To avoid pushing images from forks, only push when the repository owner is 'vllm-project'
      - name: Push image to SWR
        if: ${{ github.repository_owner == 'vllm-project' && steps.login-swr.conclusion == 'success' }}
        env:
          IMAGE_TAG: ${{ steps.build-image.outputs.image-tag }}
        run: |
          docker push "$IMAGE_TAG"

View File

@@ -42,9 +42,18 @@ concurrency:
cancel-in-progress: true
jobs:
image_build:
name: nightly image build
uses: ./.github/workflows/_nightly_image_build.yaml
with:
target: a2
secrets:
HW_USERNAME: ${{ secrets.HW_USERNAME }}
HW_TOKEN: ${{ secrets.HW_TOKEN }}
single-node-tests:
name: single-node
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
needs: image_build
strategy:
fail-fast: false
matrix:
@@ -63,10 +72,11 @@ jobs:
vllm: v0.11.0
runner: ${{ matrix.test_config.os }}
tests: ${{ matrix.test_config.tests }}
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
multi-node-tests:
name: multi-node
needs: single-node-tests
needs: [single-node-tests, image_build]
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
@@ -83,7 +93,7 @@ jobs:
with:
soc_version: a2
runner: linux-aarch64-a2-0
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}

View File

@@ -41,9 +41,18 @@ concurrency:
cancel-in-progress: true
jobs:
image_build:
name: nightly image build
uses: ./.github/workflows/_nightly_image_build.yaml
with:
target: a3
secrets:
HW_USERNAME: ${{ secrets.HW_USERNAME }}
HW_TOKEN: ${{ secrets.HW_TOKEN }}
single-node-tests:
name: single-node
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
needs: image_build
strategy:
fail-fast: false
matrix:
@@ -94,13 +103,13 @@ jobs:
with:
vllm: v0.11.0
runner: ${{ matrix.test_config.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
tests: ${{ matrix.test_config.tests }}
name: ${{ matrix.test_config.name }}
multi-node-tests:
name: multi-node
needs: single-node-tests
needs: [single-node-tests, image_build]
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
@@ -129,7 +138,7 @@ jobs:
with:
soc_version: a3
runner: linux-aarch64-a3-0
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}