### What this PR does / why we need it?

The custom DeepSeek modeling was changed in https://github.com/vllm-project/vllm-ascend/pull/585 to support graph mode, so this PR applies the same changes to the custom deepseek_mtp modeling. Some modifications needed for k>1 were not carried over by https://github.com/vllm-project/vllm-ascend/pull/429; they are added here.

To better maintain the MTP feature in the vllm-ascend repository, this PR also adds test cases for graph mode (torchair), but they are skipped for now because torchair cannot correctly clean up memory in VllmRunner. Test cases for MTP quantization weights are added as well; since the quantized test weights are not ready yet, those cases are also skipped and will be enabled once the weights are available.

Finally, https://github.com/vllm-project/vllm-ascend/pull/648 did not completely fix the sampler change issue (https://github.com/vllm-project/vllm-ascend/issues/660), so the remaining changes are included here.

### Does this PR introduce _any_ user-facing change?

Yes. You can now use MTP with DeepSeek V3/R1 float or quantized weights in eager mode:

```python
llm = LLM(
    model="wemaster/deepseek_mtp_main_random_bf16",
    tensor_parallel_size=2,
    speculative_config={
        "num_speculative_tokens": 1,
    },
    enforce_eager=True,
    trust_remote_code=True,
    disable_log_stats=False,
    gpu_memory_utilization=0.8,
    max_model_len=64,
)
```

or in graph mode (torchair):

```python
llm = LLM(
    model="wemaster/deepseek_mtp_main_random_bf16",
    tensor_parallel_size=2,
    speculative_config={
        "num_speculative_tokens": 1,
    },
    trust_remote_code=True,
    additional_config={
        'enable_graph_mode': True,
    },
    disable_log_stats=False,
    gpu_memory_utilization=0.8,
    max_model_len=64,
)
```

Notes:

1. k>1 is now supported, so you can set `num_speculative_tokens > 1` if there is sufficient redundant computing power (see the sketch after this description).
2. MTP is not supported on the V1 engine; it will be supported once vLLM lands it in https://github.com/vllm-project/vllm/issues/13500.
3. If MTP fails with a `segmentation fault`, apply the v0.7.3 patch from https://github.com/vllm-project/vllm-ascend/pull/236, file `vllm_ascend/patch/patch_metrics.py`, method `__npu_async_metrics_collector_init__`.

### How was this patch tested?

Passed local testing and CI.

Signed-off-by: mengwei805 <mengwei25@huawei.com>
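
As a minimal sketch for note 1 above, assuming the same random-weight test model as the examples (only `num_speculative_tokens` changes; the prompt and sampling values are illustrative):

```python
from vllm import LLM, SamplingParams

# k=2: the MTP module drafts two speculative tokens per step, which only
# pays off when there is spare compute to verify the extra draft tokens.
llm = LLM(
    model="wemaster/deepseek_mtp_main_random_bf16",
    tensor_parallel_size=2,
    speculative_config={
        "num_speculative_tokens": 2,
    },
    enforce_eager=True,
    trust_remote_code=True,
    gpu_memory_utilization=0.8,
    max_model_len=64,
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```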

The updated CI workflow, `.github/workflows/vllm_ascend_test.yaml`:

```yaml
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

name: 'e2e test'

on:
  pull_request:
    branches:
      - 'main'
      - '*-dev'
    paths:
      - '*.txt'
      - '**/*.py'
      - '.github/workflows/vllm_ascend_test.yaml'
      - '!docs/**'
      - 'pytest.ini'

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

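# Only one e2e run per PR: pushing a new commit cancels the in-flight run
# for the same pull request.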
concurrency:
  group: pr-${{ github.event.pull_request.number }}
  cancel-in-progress: true

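# The matrix below yields four jobs (two runner sizes x two vLLM refs);
# max-parallel keeps at most two of them running at once.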
jobs:
  test:
    strategy:
      max-parallel: 2
      matrix:
        os: [linux-arm64-npu-1, linux-arm64-npu-4]
        vllm_version: [main, v0.8.4]
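    # GitHub Actions expressions have no ternary operator; the `cond && a || b`
    # chain below emulates one: jobs on the 4-NPU runner all share the group
    # `pr-<PR#>-limit-npu-4` (serializing them so they cannot exhaust the large
    # runners), while every other job gets a unique group of its own.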
    concurrency:
      group: >
        ${{
        matrix.os == 'linux-arm64-npu-4'
        && github.event.pull_request.number
        && format('pr-{0}-limit-npu-4', github.event.pull_request.number)
        || format('job-{0}-{1}-{2}', matrix.os, matrix.vllm_version, github.event.pull_request.number)
        }}
      cancel-in-progress: false
    name: vLLM Ascend test
    runs-on: ${{ matrix.os }}
    container:
      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
      env:
        HF_ENDPOINT: https://hf-mirror.com
        HF_TOKEN: ${{ secrets.HF_TOKEN }}

    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y
          apt-get install -y git
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
          apt-get -y install $(cat packages.txt)
          apt-get -y install gcc g++ cmake libnuma-dev

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: ${{ matrix.vllm_version }}
          path: ./vllm-empty

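      # VLLM_TARGET_DEVICE=empty installs vLLM's Python package without
      # compiling any device-specific kernels; the NPU backend is supplied
      # by vllm-ascend, installed in the next step.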
      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        run: |
          pip install -r requirements-dev.txt
          pip install -v -e .

      - name: Run vllm-project/vllm-ascend test for V1 Engine
        env:
          VLLM_USE_V1: 1
          VLLM_WORKER_MULTIPROC_METHOD: spawn
        run: |
          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
            pytest -sv tests/singlecard/test_offline_inference.py
            pytest -sv tests/ops
            pytest -sv tests/compile
          else
            pytest -sv tests/multicard/test_offline_inference_distributed.py
            pytest -sv tests/ops
            pytest -sv tests/compile
          fi

      - name: Run vllm-project/vllm-ascend test on V0 engine
        env:
          VLLM_USE_V1: 0
        run: |
          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
            pytest -sv tests/singlecard/test_offline_inference.py
            pytest -sv tests/ops
          else
            pytest -sv tests/multicard/test_offline_inference_distributed.py
            pytest -sv tests/ops
          fi

      # Only run spec decode tests when the related code changed.
      - name: Check for changes in Speculative Decode
        id: filter_spec_decode
        uses: dorny/paths-filter@v3
        with:
          filters: |
            speculative_tests_changed:
              - "tests/singlecard/spec_decode/**"
              - "tests/multicard/spec_decode_e2e/**"
              - "vllm_ascend/worker/worker.py"
              - "vllm_ascend/worker/model_runner.py"
              - "vllm_ascend/worker/multi_step_runner.py"
              - "vllm_ascend/worker/multi_step_worker.py"
              - "vllm_ascend/worker/draft_model_runner.py"
              - "vllm_ascend/patch/worker/patch_common/patch_metrics.py"
              - "vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py"
              - "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py"

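      # dorny/paths-filter exposes each filter name as a step output, so the
      # step below is gated on `speculative_tests_changed` being 'true'.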
      - name: Run vllm-project/vllm-ascend Speculative Decode test
        env:
          VLLM_USE_V1: 0
        if: steps.filter_spec_decode.outputs.speculative_tests_changed == 'true'
        run: |
          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
            pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
            pytest -sv tests/singlecard/spec_decode --ignore=tests/singlecard/spec_decode/e2e/test_mtp_correctness.py
          fi

      - name: Run vllm-project/vllm test for V0 Engine
        env:
          VLLM_USE_V1: 0
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
        run: |
          pytest -sv
```