Files
xc-llm-ascend/.github/workflows/_e2e_nightly_single_node.yaml
Li Wang c2f776b846 [Nightly] Initial logging for nightly multi-node testing (#5362)
### What this PR does / why we need it?
Currently, our multi-node logs only show the master node's logs (via the
Kubernetes API), which is insufficient for effective problem
localization if other nodes experience issues. Therefore, this pull
request adds the ability to upload logs for other nodes.

Next plan: Output structured directory logs, including logs from each
node and the polog.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
bc0a5a0c08

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-12-26 11:39:07 +08:00

151 lines
5.8 KiB
YAML

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'e2e nightly test'
on:
workflow_call:
inputs:
vllm:
required: true
type: string
runner:
required: true
type: string
image:
required: false
type: string
default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11"
tests:
required: true
type: string
name:
required: false
type: string
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.tests }}
cancel-in-progress: true
jobs:
e2e-nightly:
name: ${{ inputs.tests }}
runs-on: ${{ inputs.runner }}
timeout-minutes: 600
container:
image: ${{ inputs.image }}
env:
TRANSFORMERS_OFFLINE: 1
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Show vLLM and vLLM-Ascend version
working-directory: /vllm-workspace
run: |
echo "Installed vLLM-related Python packages:"
pip list | grep vllm || echo "No vllm packages found."
echo ""
echo "============================"
echo "vLLM Git information"
echo "============================"
cd vllm
if [ -d .git ]; then
echo "Branch: $(git rev-parse --abbrev-ref HEAD)"
echo "Commit hash: $(git rev-parse HEAD)"
echo "Author: $(git log -1 --pretty=format:'%an <%ae>')"
echo "Date: $(git log -1 --pretty=format:'%ad' --date=iso)"
echo "Message: $(git log -1 --pretty=format:'%s')"
echo "Tags: $(git tag --points-at HEAD || echo 'None')"
echo "Remote: $(git remote -v | head -n1)"
echo ""
else
echo "No .git directory found in vllm"
fi
cd ..
echo ""
echo "============================"
echo "vLLM-Ascend Git information"
echo "============================"
cd vllm-ascend
if [ -d .git ]; then
echo "Branch: $(git rev-parse --abbrev-ref HEAD)"
echo "Commit hash: $(git rev-parse HEAD)"
echo "Author: $(git log -1 --pretty=format:'%an <%ae>')"
echo "Date: $(git log -1 --pretty=format:'%ad' --date=iso)"
echo "Message: $(git log -1 --pretty=format:'%s')"
echo "Tags: $(git tag --points-at HEAD || echo 'None')"
echo "Remote: $(git remote -v | head -n1)"
echo ""
else
echo "No .git directory found in vllm-ascend"
fi
cd ..
- name: Install custom-ops (for DeepSeek-V3.2-Exp)
if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
shell: bash -l {0}
run: |
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
./CANN-custom_ops-sfa-linux.aarch64.run --quiet
export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
. /usr/local/Ascend/ascend-toolkit/set_env.sh
- name: Install triton-ascend
if: ${{ inputs.name == 'test_custom_op' }}
shell: bash -l {0}
run: |
. /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl
pip install triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl
- name: Run vllm-project/vllm-ascend test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
VLLM_CI_RUNNER: ${{ inputs.runner }}
BENCHMARK_HOME: /vllm-workspace/vllm-ascend/benchmark
working-directory: /vllm-workspace/vllm-ascend
run: |
# ignore test_dispatch_ffn_combine until the test is fixed
pytest -sv ${{ inputs.tests }} \
--ignore=tests/e2e/nightly/ops/test_dispatch_ffn_combine.py \
--ignore=tests/e2e/nightly/ops/test_fused_moe.py \
--ignore=tests/e2e/nightly/ops/test_rotary_embedding.py \
--ignore=tests/e2e/nightly/ops/test_matmul_allreduce_add_rmsnorm.py