xc-llm-ascend/.github/workflows/_e2e_test.yaml
Wang Kunpeng 13cd6362c6 [bugfix] fix `ValueError: Duplicate layer name` (#5280)
### What this PR does / why we need it?
When matmul_and_reduce is enabled, the prefix attribute is required.
However, some models do not pass the prefix correctly, so starting the
service fails with the duplicate-layer error above.
The incorrect prefix passing itself will be fixed in vLLM in the
future.
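
To see why a missing prefix trips this error, here is a minimal sketch of
the failure mode; `register_layer` and `_registered_layers` are hypothetical
stand-ins, not vLLM's actual layer registry, and the empty-prefix fallback is
only one plausible shape for the workaround:

```python
import itertools

# Hypothetical registry illustrating vLLM-style unique layer naming.
_registered_layers: dict[str, object] = {}
_auto_id = itertools.count()

def register_layer(layer: object, prefix: str = "") -> str:
    """Register a layer under a unique name derived from its prefix.

    If a model forgets to thread `prefix` through, every layer lands on
    the same empty name and the second registration raises
    ValueError: Duplicate layer name.
    """
    # Fall back to a generated unique name when no prefix was passed.
    name = prefix or f"{type(layer).__name__}_{next(_auto_id)}"
    if name in _registered_layers:
        raise ValueError(f"Duplicate layer name: {name}")
    _registered_layers[name] = layer
    return name
```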

- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
2025-12-25 10:43:24 +08:00

name: 'e2e test'
on:
workflow_call:
inputs:
vllm:
required: true
type: string
runner:
required: true
type: string
image:
required: true
type: string
type:
required: true
type: string
jobs:
e2e:
name: singlecard
runs-on: ${{ inputs.runner }}-1
container:
image: ${{ inputs.image }}
env:
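      # ModelScope is the weight source; TRANSFORMERS_OFFLINE keeps
      # transformers from contacting the Hugging Face hub during tests.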
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
TRANSFORMERS_OFFLINE: 1
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
with:
repository: vllm-project/vllm
ref: ${{ inputs.vllm }}
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
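          # The "empty" device target installs vLLM's Python code without
          # building device kernels; the Ascend backend comes from vllm-ascend.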
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
- name: Run vllm-project/vllm-ascend test (non triton)
env:
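          # spawn avoids forking workers after the NPU runtime is initialized;
          # max_split_size_mb caps allocator block splits to limit fragmentation.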
VLLM_WORKER_MULTIPROC_METHOD: spawn
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
if: ${{ inputs.type == 'full' }}
run: |
pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_mem.py
pytest -sv --durations=0 tests/e2e/singlecard/test_camem.py
- name: Install Ascend toolkit & triton_ascend
shell: bash -l {0}
run: |
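          # triton_ascend needs the BiSheng compiler shipped with the Ascend
          # toolkit, so its environment is sourced before the prebuilt wheel
          # is installed.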
. /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
- name: Run vllm-project/vllm-ascend test
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
if: ${{ inputs.type == 'light' }}
run: |
# pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
# pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_qwen_pooling_classify_correctness
- name: Run e2e test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
if: ${{ inputs.type == 'full' }}
run: |
          # Running the aclgraph tests in one batch triggers an AclmdlRICaptureBegin
          # error, so each test runs separately.
pytest -sv --durations=0 tests/e2e/nightly/ops/triton/
pytest -sv --durations=0 tests/e2e/singlecard/test_completion_with_prompt_embeds.py
pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
pytest -sv --durations=0 tests/e2e/singlecard/test_async_scheduling.py
pytest -sv --durations=0 tests/e2e/singlecard/test_guided_decoding.py
# torch 2.8 doesn't work with lora, fix me
#pytest -sv --durations=0 tests/e2e/singlecard/test_ilama_lora.py
pytest -sv --durations=0 tests/e2e/singlecard/test_profile_execute_duration.py
pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
pytest -sv --durations=0 tests/e2e/singlecard/test_sampler.py
pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py
pytest -sv --durations=0 tests/e2e/singlecard/test_xlite.py
pytest -sv --durations=0 tests/e2e/singlecard/test_models.py
pytest -sv --durations=0 tests/e2e/singlecard/pooling/
pytest -sv --durations=0 tests/e2e/singlecard/compile/test_norm_quant_fusion.py
pytest -sv --durations=0 tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
# ------------------------------------ v1 spec decode test ------------------------------------ #
pytest -sv --durations=0 tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
pytest -sv --durations=0 tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
e2e-2-cards:
name: multicard-2
runs-on: linux-aarch64-a3-2
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-a3-ubuntu22.04-py3.11
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
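      # HCCL shared-buffer size per communication domain, in MB.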
HCCL_BUFFSIZE: 1024
TRANSFORMERS_OFFLINE: 1
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
          # FIXME: use the nginx cache rather than the public pypi mirror
# sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
# pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
# pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git -y
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
with:
repository: vllm-project/vllm
ref: ${{ inputs.vllm }}
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
- name: Run vllm-project/vllm-ascend test (non triton)
if: ${{ inputs.type == 'full' }}
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
pytest -sv --durations=0 tests/e2e/multicard/test_aclgraph_capture_replay.py
- name: Install Ascend toolkit & triton_ascend
shell: bash -l {0}
run: |
. /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
- name: Run vllm-project/vllm-ascend test (light)
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
if: ${{ inputs.type == 'light' }}
run: |
pytest -sv --durations=0 tests/e2e/multicard/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep
- name: Run vllm-project/vllm-ascend test (full)
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
if: ${{ inputs.type == 'full' }}
run: |
pytest -sv --durations=0 tests/e2e/multicard/test_quantization.py
pytest -sv --durations=0 tests/e2e/multicard/test_full_graph_mode.py
pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel.py
pytest -sv --durations=0 tests/e2e/multicard/test_expert_parallel.py
pytest -sv --durations=0 tests/e2e/multicard/test_external_launcher.py
pytest -sv --durations=0 tests/e2e/multicard/test_single_request_aclgraph.py
# torch 2.8 doesn't work with lora, fix me
#pytest -sv --durations=0 tests/e2e/multicard/test_ilama_lora_tp2.py
          # To avoid OOM, each of these tests runs in its own process.
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_w4a8_dynamic_tp2
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_moe_sp_tp2
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_moe_fc2_tp2
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy_tp2
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2
pytest -sv --durations=0 tests/e2e/multicard/test_prefix_caching.py
pytest -sv --durations=0 tests/e2e/multicard/test_pipeline_parallel.py
pytest -sv --durations=0 tests/e2e/multicard/test_qwen3_moe.py
pytest -sv --durations=0 tests/e2e/multicard/test_offline_weight_load.py
e2e-4-cards:
name: multicard-4
needs: [e2e, e2e-2-cards]
if: ${{ needs.e2e.result == 'success' && needs.e2e-2-cards.result == 'success' && inputs.type == 'full' }}
runs-on: linux-aarch64-a3-4
container:
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
TRANSFORMERS_OFFLINE: 1
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git wget curl -y
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v6
with:
path: ./vllm-ascend
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
with:
repository: vllm-project/vllm
ref: ${{ inputs.vllm }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
working-directory: ./vllm-ascend
run: |
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/devlib
pip install -r requirements-dev.txt
pip install -v -e .
- name: Install Ascend toolkit & triton_ascend
shell: bash -l {0}
run: |
. /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
- name: Run vllm-project/vllm-ascend test for V1 Engine
working-directory: ./vllm-ascend
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_kimi_k2_thinking_w4a16_tp4
pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel_tp2.py
pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_basic.py
pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_accuracy.py
pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_mtp.py
pytest -sv --durations=0 tests/e2e/multicard/test_qwen3_next.py