### What this PR does / why we need it?
When `matmul_and_reduce` is enabled, the `prefix` attribute is required. In some models, however, the prefix is not passed correctly, which causes errors when starting the service. The underlying prefix-passing issue itself will be fixed in vLLM in the future.
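
A minimal sketch of the kind of guard this implies (the function and parameter names below are hypothetical, not the actual vllm-ascend symbols):

```python
from typing import Optional

# Illustrative sketch only: `matmul_and_reduce_enabled` and its parameters
# are hypothetical names, not the real vllm-ascend API.
def matmul_and_reduce_enabled(enabled: bool, prefix: Optional[str]) -> bool:
    # The fused matmul_and_reduce path needs a valid layer prefix; if a
    # model did not thread `prefix` through, fall back to the unfused
    # path instead of failing at service startup.
    return bool(enabled and prefix)
```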
- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
name: 'e2e test'

on:
  workflow_call:
    inputs:
      vllm:
        required: true
        type: string
      runner:
        required: true
        type: string
      image:
        required: true
        type: string
      type:
        required: true
        type: string
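
# This is a reusable workflow (`workflow_call` only); it never triggers on its own.
# Example caller (hypothetical: the `uses:` path and input values are illustrative,
# not taken from this repository). Note that the singlecard job appends `-1` to
# `runner`, so callers pass the runner-label prefix:
#
#   jobs:
#     e2e:
#       uses: ./.github/workflows/_e2e_test.yaml
#       with:
#         vllm: main
#         runner: linux-aarch64-a3
#         image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-a3-ubuntu22.04-py3.11
#         type: light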

jobs:
  e2e:
    name: singlecard
    runs-on: ${{ inputs.runner }}-1
    container:
      image: ${{ inputs.image }}
      env:
        VLLM_LOGGING_LEVEL: ERROR
        VLLM_USE_MODELSCOPE: True
        TRANSFORMERS_OFFLINE: 1
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
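
      # Route apt and pip through the in-cluster cache service, presumably to
      # keep CI installs fast and independent of external mirrors.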
      - name: Config mirrors
        run: |
          sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
          pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
          pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
          apt-get update -y
          apt install git -y

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v6

      - name: Install system dependencies
        run: |
          apt-get -y install `cat packages.txt`
          apt-get -y install gcc g++ cmake libnuma-dev

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v6
        with:
          repository: vllm-project/vllm
          ref: ${{ inputs.vllm }}
          path: ./vllm-empty
          fetch-depth: 1

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        env:
          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
        run: |
          pip install -r requirements-dev.txt
          pip install -v -e .

      - name: Run vllm-project/vllm-ascend test (non triton)
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
        if: ${{ inputs.type == 'full' }}
        run: |
          pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_mem.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_camem.py
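
      # Assumption: the triton_ascend wheel needs the BiSheng toolchain from
      # CANN 8.3.RC2, hence set_env.sh is sourced before installing it.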
      - name: Install Ascend toolkit & triton_ascend
        shell: bash -l {0}
        run: |
          . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"

      - name: Run vllm-project/vllm-ascend test
        env:
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        if: ${{ inputs.type == 'light' }}
        run: |
          # pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
          # pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
          pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_qwen_pooling_classify_correctness

      - name: Run e2e test
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
        if: ${{ inputs.type == 'full' }}
        run: |
          # We found that running the aclgraph tests in one batch causes an
          # AclmdlRICaptureBegin error, so we run each test separately.

          pytest -sv --durations=0 tests/e2e/nightly/ops/triton/
          pytest -sv --durations=0 tests/e2e/singlecard/test_completion_with_prompt_embeds.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_async_scheduling.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_guided_decoding.py
          # torch 2.8 doesn't work with lora, fix me
          # pytest -sv --durations=0 tests/e2e/singlecard/test_ilama_lora.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_profile_execute_duration.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_sampler.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_xlite.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_models.py
          pytest -sv --durations=0 tests/e2e/singlecard/pooling/
          pytest -sv --durations=0 tests/e2e/singlecard/compile/test_norm_quant_fusion.py
          pytest -sv --durations=0 tests/e2e/singlecard/test_multistream_overlap_shared_expert.py

          # ------------------------------------ v1 spec decode test ------------------------------------ #
          pytest -sv --durations=0 tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
          pytest -sv --durations=0 tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
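
  # Two-card job: exercises the distributed (TP/EP/DP) paths. HCCL_BUFFSIZE is
  # set to 1024 here, presumably to give multi-NPU collectives more buffer headroom.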
  e2e-2-cards:
    name: multicard-2
    runs-on: linux-aarch64-a3-2
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-a3-ubuntu22.04-py3.11
      env:
        VLLM_LOGGING_LEVEL: ERROR
        VLLM_USE_MODELSCOPE: True
        HCCL_BUFFSIZE: 1024
        TRANSFORMERS_OFFLINE: 1
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          # Fix me: use the nginx cache rather than PyPI
          # sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
          # pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
          # pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y
          apt install git -y

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v6

      - name: Install system dependencies
        run: |
          apt-get -y install `cat packages.txt`
          apt-get -y install gcc g++ cmake libnuma-dev

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v6
        with:
          repository: vllm-project/vllm
          ref: ${{ inputs.vllm }}
          path: ./vllm-empty
          fetch-depth: 1

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        env:
          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
        run: |
          pip install -r requirements-dev.txt
          pip install -v -e .

      - name: Run vllm-project/vllm-ascend test (non triton)
        if: ${{ inputs.type == 'full' }}
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        run: |
          pytest -sv --durations=0 tests/e2e/multicard/test_aclgraph_capture_replay.py

      - name: Install Ascend toolkit & triton_ascend
        shell: bash -l {0}
        run: |
          . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"

      - name: Run vllm-project/vllm-ascend test (light)
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        if: ${{ inputs.type == 'light' }}
        run: |
          pytest -sv --durations=0 tests/e2e/multicard/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep

      - name: Run vllm-project/vllm-ascend test (full)
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        if: ${{ inputs.type == 'full' }}
        run: |
          pytest -sv --durations=0 tests/e2e/multicard/test_quantization.py
          pytest -sv --durations=0 tests/e2e/multicard/test_full_graph_mode.py
          pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel.py
          pytest -sv --durations=0 tests/e2e/multicard/test_expert_parallel.py
          pytest -sv --durations=0 tests/e2e/multicard/test_external_launcher.py
          pytest -sv --durations=0 tests/e2e/multicard/test_single_request_aclgraph.py
          # torch 2.8 doesn't work with lora, fix me
          # pytest -sv --durations=0 tests/e2e/multicard/test_ilama_lora_tp2.py

          # To avoid OOM, we run each of these tests in a separate process.
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_w4a8_dynamic_tp2
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_moe_sp_tp2
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_moe_fc2_tp2
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy_tp2
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2

          pytest -sv --durations=0 tests/e2e/multicard/test_prefix_caching.py
          pytest -sv --durations=0 tests/e2e/multicard/test_pipeline_parallel.py
          pytest -sv --durations=0 tests/e2e/multicard/test_qwen3_moe.py
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_weight_load.py
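
  # Four-card job: gated on both smaller jobs succeeding, and only for 'full' runs.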
  e2e-4-cards:
    name: multicard-4
    needs: [e2e, e2e-2-cards]
    if: ${{ needs.e2e.result == 'success' && needs.e2e-2-cards.result == 'success' && inputs.type == 'full' }}
    runs-on: linux-aarch64-a3-4
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
      env:
        VLLM_LOGGING_LEVEL: ERROR
        VLLM_USE_MODELSCOPE: True
        TRANSFORMERS_OFFLINE: 1
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y
          apt install git wget curl -y
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v6
        with:
          path: ./vllm-ascend

      - name: Install system dependencies
        run: |
          apt-get -y install `cat packages.txt`
          apt-get -y install gcc g++ cmake libnuma-dev

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v6
        with:
          repository: vllm-project/vllm
          ref: ${{ inputs.vllm }}
          path: ./vllm-empty

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        working-directory: ./vllm-ascend
        run: |
          export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
          pip install -r requirements-dev.txt
          pip install -v -e .

      - name: Install Ascend toolkit & triton_ascend
        shell: bash -l {0}
        run: |
          . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"

      - name: Run vllm-project/vllm-ascend test for V1 Engine
        working-directory: ./vllm-ascend
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        run: |
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_kimi_k2_thinking_w4a16_tp4
          pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel_tp2.py
          pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_basic.py
          pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_accuracy.py
          pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_mtp.py
          pytest -sv --durations=0 tests/e2e/multicard/test_qwen3_next.py