### What this PR does / why we need it?
Currently, our multi-node test logs only include the master node's logs
(retrieved via the Kubernetes API), which is not enough to diagnose
failures that originate on other nodes. This pull request therefore adds
the ability to upload the logs of the other nodes as well.
Next step: output the logs in a structured directory layout, including the
logs from each node and the polog.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main:
bc0a5a0c08
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
151 lines
5.8 KiB
YAML
151 lines
5.8 KiB
YAML
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

name: 'e2e nightly test'

# Reusable workflow: the only trigger is `workflow_call`, so it runs when
# another workflow invokes it with the inputs below.
on:
  workflow_call:
    inputs:
      # Required by the interface, but not referenced by any job or step in
      # the visible part of this workflow.
      vllm:
        required: true
        type: string
      # Runner label for the job; also exported to the test step as
      # VLLM_CI_RUNNER.
      runner:
        required: true
        type: string
      # Container image the job runs in.
      image:
        required: false
        type: string
        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11"
      # Test target(s) handed to pytest; also used as the job display name
      # and as part of the concurrency group.
      tests:
        required: true
        type: string
      # Optional selector compared against 'deepseek3_2-exp-w8a8' and
      # 'test_custom_op' to enable the matching extra install step.
      name:
        required: false
        type: string

# Bash shells do not read ~/.profile or ~/.bashrc by default, so every step
# that needs the ascend-toolkit environment variables must run in a login
# shell. "bash -el {0}" is therefore the default for all `run` steps:
# -l makes it a login shell, -e aborts the step on the first failing command.
defaults:
  run:
    shell: bash -el {0}

# Only cancel in-progress runs that share the same concurrency group.
# The group key combines the workflow ref, the git ref and the `tests`
# input, so runs for different test targets do not cancel each other.
concurrency:
  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.tests }}
  cancel-in-progress: true

jobs:
  # Single job: runs the requested pytest target(s) inside the given
  # container image on the given runner.
  e2e-nightly:
    name: ${{ inputs.tests }}
    runs-on: ${{ inputs.runner }}
    timeout-minutes: 600
    container:
      image: ${{ inputs.image }}
      # NOTE(review): `env` is placed under `container`; the flattened source
      # does not show whether it was container-level or job-level - confirm.
      env:
        TRANSFORMERS_OFFLINE: 1
        VLLM_USE_MODELSCOPE: True
    steps:
      # Print the NPU status and the installed CANN toolkit information.
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      # Log which vLLM / vLLM-Ascend packages and git revisions are present
      # in the image, to make nightly failures traceable to a commit.
      - name: Show vLLM and vLLM-Ascend version
        working-directory: /vllm-workspace
        run: |
          echo "Installed vLLM-related Python packages:"
          pip list | grep vllm || echo "No vllm packages found."

          echo ""
          echo "============================"
          echo "vLLM Git information"
          echo "============================"
          cd vllm
          if [ -d .git ]; then
            echo "Branch: $(git rev-parse --abbrev-ref HEAD)"
            echo "Commit hash: $(git rev-parse HEAD)"
            echo "Author: $(git log -1 --pretty=format:'%an <%ae>')"
            echo "Date: $(git log -1 --pretty=format:'%ad' --date=iso)"
            echo "Message: $(git log -1 --pretty=format:'%s')"
            # `git tag` exits 0 even when it prints nothing, so filter through
            # `grep .` to make the 'None' fallback actually trigger.
            echo "Tags: $(git tag --points-at HEAD | grep . || echo 'None')"
            echo "Remote: $(git remote -v | head -n1)"
            echo ""
          else
            echo "No .git directory found in vllm"
          fi
          cd ..

          echo ""
          echo "============================"
          echo "vLLM-Ascend Git information"
          echo "============================"
          cd vllm-ascend
          if [ -d .git ]; then
            echo "Branch: $(git rev-parse --abbrev-ref HEAD)"
            echo "Commit hash: $(git rev-parse HEAD)"
            echo "Author: $(git log -1 --pretty=format:'%an <%ae>')"
            echo "Date: $(git log -1 --pretty=format:'%ad' --date=iso)"
            echo "Message: $(git log -1 --pretty=format:'%s')"
            # Same fallback fix as for the vLLM checkout above.
            echo "Tags: $(git tag --points-at HEAD | grep . || echo 'None')"
            echo "Remote: $(git remote -v | head -n1)"
            echo ""
          else
            echo "No .git directory found in vllm-ascend"
          fi
          cd ..

      # Only runs when the caller passes name == 'deepseek3_2-exp-w8a8'.
      # NOTE(review): this step overrides the default shell with
      # "bash -l {0}" (no -e), so a failing wget/pip does not abort the step.
      # NOTE(review): each `run` step starts a new shell, so the `export`s and
      # the sourced set_env.sh below do not carry over to later steps unless
      # they are written to $GITHUB_ENV - confirm this is intended.
      - name: Install custom-ops (for DeepSeek-V3.2-Exp)
        if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
        shell: bash -l {0}
        run: |
          wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
          chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
          ./CANN-custom_ops-sfa-linux.aarch64.run --quiet
          export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
          export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
          wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
          pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
          . /usr/local/Ascend/ascend-toolkit/set_env.sh

      # Only runs when the caller passes name == 'test_custom_op'.
      # NOTE(review): the set_env.sh path is pinned to toolkit 8.3.RC2, which
      # matches the default `image` tag; a different image may not provide it.
      # NOTE(review): also uses "bash -l {0}" without -e (see step above).
      - name: Install triton-ascend
        if: ${{ inputs.name == 'test_custom_op' }}
        shell: bash -l {0}
        run: |
          . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
          wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl
          pip install triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl

      # Run the requested test target(s) from the vllm-ascend checkout.
      - name: Run vllm-project/vllm-ascend test
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          VLLM_USE_MODELSCOPE: True
          VLLM_CI_RUNNER: ${{ inputs.runner }}
          BENCHMARK_HOME: /vllm-workspace/vllm-ascend/benchmark
        working-directory: /vllm-workspace/vllm-ascend
        run: |
          # The ops tests listed below are excluded from the run
          # (test_dispatch_ffn_combine is ignored until the test is fixed).
          pytest -sv ${{ inputs.tests }} \
            --ignore=tests/e2e/nightly/ops/test_dispatch_ffn_combine.py \
            --ignore=tests/e2e/nightly/ops/test_fused_moe.py \
            --ignore=tests/e2e/nightly/ops/test_rotary_embedding.py \
            --ignore=tests/e2e/nightly/ops/test_matmul_allreduce_add_rmsnorm.py