From afdbf774835adc0d9bab0e73ee201dae40a4ae52 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Tue, 8 Apr 2025 16:52:45 +0800 Subject: [PATCH] [CI] Add new runner and enable QwQ multinpu test (#417) ### What this PR does / why we need it? - Add a new runner to the continuous integration system and keep the original CI runner until the new runner runs stably - Add distributed test cases ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed --------- Signed-off-by: wangli --- .github/workflows/vllm_ascend_test.yaml | 367 +++++++++++++++++++ .github/workflows/vllm_ascend_test_main.yaml | 119 ------ pytest.ini | 3 + tests/conftest.py | 13 +- tests/test_offline_inference.py | 31 +- 5 files changed, 405 insertions(+), 128 deletions(-) create mode 100644 .github/workflows/vllm_ascend_test.yaml delete mode 100644 .github/workflows/vllm_ascend_test_main.yaml diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml new file mode 100644 index 0000000..e69eb82 --- /dev/null +++ b/.github/workflows/vllm_ascend_test.yaml @@ -0,0 +1,367 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: 'e2e test' + +on: + pull_request: + branches: + - 'main' + - '*-dev' + paths: + - '*.txt' + - '**/*.py' + - '.github/workflows/vllm_ascend_test.yaml' + - '!docs/**' + - 'pytest.ini' + +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. +defaults: + run: + shell: bash -el {0} + +concurrency: + group: pr-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + test-singlenpu: + name: vLLM Ascend test main(single-npu) + runs-on: linux-arm64-npu-1 # actionlint-ignore: runner-label + container: + image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10 + steps: + - name: Check npu and CANN info + run: | + npu-smi info + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + + - name: Config mirrors + run: | + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + apt-get update -y + apt install git -y + git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install system dependencies + run: | + apt-get -y install `cat packages.txt` + apt-get -y install gcc g++ cmake libnuma-dev + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty pip install -e . + + - name: Install vllm-project/vllm-ascend + run: | + pip install -r requirements-dev.txt + pip install -e . + + - name: Install pta + run: | + if [ ! -d /root/.cache/pta ]; then + mkdir -p /root/.cache/pta + fi + + if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then + cd /root/.cache/pta + rm -rf pytorch_v2.5.1_py310* + wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz + tar -zxvf pytorch_v2.5.1_py310.tar.gz + fi + + pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl + + - name: Run vllm-project/vllm-ascend test on V0 engine + env: + VLLM_USE_V1: 0 + HF_ENDPOINT: https://hf-mirror.com + run: | + VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests + + - name: Run vllm-project/vllm-ascend test for V1 Engine + env: + VLLM_USE_V1: 1 + VLLM_WORKER_MULTIPROC_METHOD: spawn + HF_ENDPOINT: https://hf-mirror.com + run: | + pytest -sv -m 'not multinpu' tests + + - name: Run vllm-project/vllm test for V0 Engine + env: + VLLM_USE_V1: 0 + PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 + HF_ENDPOINT: https://hf-mirror.com + run: | + pytest -sv + + test-multinpu: + name: vLLM Ascend test main(multi-npu) + runs-on: linux-arm64-npu-4 + container: + image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10 + env: + HF_ENDPOINT: https://hf-mirror.com + HF_TOKEN: ${{ secrets.HF_TOKEN }} + steps: + - name: Check npu and CANN info + run: | + npu-smi info + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + + - name: Config mirrors + run: | + # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + - name: Install system dependencies + run: | + apt-get update -y + apt-get -y install git wget + + - name: Config git + run: | + git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty pip install -e . + + - name: Install vllm-project/vllm-ascend + run: | + pip install -r requirements-dev.txt + pip install -e . + + - name: Install pta + run: | + if [ ! -d /root/.cache/pta ]; then + mkdir -p /root/.cache/pta + fi + + if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then + cd /root/.cache/pta + rm -rf pytorch_v2.5.1_py310* + wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz + tar -zxvf pytorch_v2.5.1_py310.tar.gz + fi + + pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl + - name: Run vllm-project/vllm-ascend test on V0 engine + env: + VLLM_USE_V1: 0 + HF_ENDPOINT: https://hf-mirror.com + run: | + VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests + + - name: Run vllm-project/vllm-ascend test for V1 Engine + env: + VLLM_USE_V1: 1 + VLLM_WORKER_MULTIPROC_METHOD: spawn + HF_ENDPOINT: https://hf-mirror.com + run: | + pytest -sv -m 'multinpu' tests + + test-singlenpu-v0_8_3: + name: vLLM Ascend test v0.8.3(single-npu) + runs-on: linux-arm64-npu-1 # actionlint-ignore: runner-label + container: + image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10 + steps: + - name: Check npu and CANN info + run: | + npu-smi info + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + + - name: Config mirrors + run: | + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + apt-get update -y + apt install git -y + git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install system dependencies + run: | + apt-get -y install `cat packages.txt` + apt-get -y install gcc g++ cmake libnuma-dev + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + ref: v0.8.3 + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty pip install -e . + + - name: Install vllm-project/vllm-ascend + run: | + pip install -r requirements-dev.txt + pip install -e . + + - name: Install pta + run: | + if [ ! -d /root/.cache/pta ]; then + mkdir -p /root/.cache/pta + fi + + if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then + cd /root/.cache/pta + rm -rf pytorch_v2.5.1_py310* + wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz + tar -zxvf pytorch_v2.5.1_py310.tar.gz + fi + + pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl + + - name: Run vllm-project/vllm-ascend test on V0 engine + env: + VLLM_USE_V1: 0 + HF_ENDPOINT: https://hf-mirror.com + run: | + VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests + + - name: Run vllm-project/vllm-ascend test for V1 Engine + env: + VLLM_USE_V1: 1 + VLLM_WORKER_MULTIPROC_METHOD: spawn + HF_ENDPOINT: https://hf-mirror.com + run: | + pytest -sv -m 'not multinpu' tests + + - name: Run vllm-project/vllm test for V0 Engine + env: + VLLM_USE_V1: 0 + PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 + HF_ENDPOINT: https://hf-mirror.com + run: | + pytest -sv + + test-multinpu-v0_8_3: + name: vLLM Ascend test v0.8.3(multi-npu) + runs-on: linux-arm64-npu-4 + needs: test-multinpu + container: + image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10 + env: + HF_ENDPOINT: https://hf-mirror.com + HF_TOKEN: ${{ secrets.HF_TOKEN }} + steps: + - name: Check npu and CANN info + run: | + npu-smi info + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + + - name: Config mirrors + run: | + # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + - name: Install system dependencies + run: | + apt-get update -y + apt-get -y install git wget + + - name: Config git + run: | + git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + ref: v0.8.3 + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty pip install -e . + + - name: Install vllm-project/vllm-ascend + run: | + pip install -r requirements-dev.txt + pip install -e . + + - name: Install pta + run: | + if [ ! -d /root/.cache/pta ]; then + mkdir -p /root/.cache/pta + fi + + if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then + cd /root/.cache/pta + rm -rf pytorch_v2.5.1_py310* + wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz + tar -zxvf pytorch_v2.5.1_py310.tar.gz + fi + + pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl + - name: Run vllm-project/vllm-ascend test on V0 engine + env: + VLLM_USE_V1: 0 + HF_ENDPOINT: https://hf-mirror.com + run: | + VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests + + - name: Run vllm-project/vllm-ascend test for V1 Engine + env: + VLLM_USE_V1: 1 + VLLM_WORKER_MULTIPROC_METHOD: spawn + HF_ENDPOINT: https://hf-mirror.com + run: | + pytest -sv -m 'multinpu' tests \ No newline at end of file diff --git a/.github/workflows/vllm_ascend_test_main.yaml b/.github/workflows/vllm_ascend_test_main.yaml deleted file mode 100644 index 77feade..0000000 --- a/.github/workflows/vllm_ascend_test_main.yaml +++ /dev/null @@ -1,119 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -name: 'e2e test' - -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - '*.txt' - - '**/*.py' - - '.github/workflows/vllm_ascend_test_main.yaml' - - '!docs/**' - - 'pytest.ini' - -# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly -# declared as "shell: bash -el {0}" on steps that need to be properly activated. -# It's used to activate ascend-toolkit environment variables. -defaults: - run: - shell: bash -el {0} - -jobs: - test: - name: vLLM Ascend test (self-host) - runs-on: linux-arm64-npu-1 # actionlint-ignore: runner-label - container: - image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10 - steps: - - name: Check npu and CANN info - run: | - npu-smi info - cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info - - - name: Config mirrors - run: | - sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - apt-get update -y - apt install git -y - git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - - - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v4 - - - name: Install system dependencies - run: | - apt-get -y install `cat packages.txt` - apt-get -y install gcc g++ cmake libnuma-dev - - - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v4 - with: - repository: vllm-project/vllm - path: ./vllm-empty - - - name: Install vllm-project/vllm from source - working-directory: ./vllm-empty - run: | - VLLM_TARGET_DEVICE=empty pip install -e . - - - name: Install vllm-project/vllm-ascend - run: | - pip install -r requirements-dev.txt - pip install -e . - - - name: Install pta - run: | - if [ ! -d /root/.cache/pta ]; then - mkdir -p /root/.cache/pta - fi - - if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then - cd /root/.cache/pta - rm -rf pytorch_v2.5.1_py310* - wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz - tar -zxvf pytorch_v2.5.1_py310.tar.gz - fi - - pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl - - - name: Run vllm-project/vllm-ascend test for V0 Engine - env: - VLLM_USE_V1: 0 - HF_ENDPOINT: https://hf-mirror.com - run: | - VLLM_USE_V1=0 pytest -sv tests - - - name: Run vllm-project/vllm-ascend test for V1 Engine - env: - VLLM_USE_V1: 1 - VLLM_WORKER_MULTIPROC_METHOD: spawn - HF_ENDPOINT: https://hf-mirror.com - run: | - pytest -sv tests - - - name: Run vllm-project/vllm test for V0 Engine - env: - VLLM_USE_V1: 0 - PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 - HF_ENDPOINT: https://hf-mirror.com - run: | - pytest -sv diff --git a/pytest.ini b/pytest.ini index 8889df7..e2c9818 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,5 +1,8 @@ [pytest] minversion = 6.0 +markers = + singlenpu: tests that run on single npu + multinpu: tests that run on multi npu norecursedirs = vllm-empty/tests/prefix_caching vllm-empty/tests/weight_loading diff --git a/tests/conftest.py b/tests/conftest.py index 3a593e4..49f42c7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,14 +17,17 @@ # limitations under the License. # +import gc from typing import List, Optional, Tuple, TypeVar, Union import numpy as np import pytest +import torch from PIL import Image from vllm import LLM, SamplingParams from vllm.config import TaskOption -from vllm.distributed import cleanup_dist_env_and_memory +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt from vllm.logger import init_logger from vllm.outputs import RequestOutput @@ -37,6 +40,7 @@ from tests.model_utils import (TokensTextLogprobs, logger = init_logger(__name__) _M = TypeVar("_M") + _PromptMultiModalInput = Union[List[_M], List[List[_M]]] PromptImageInput = _PromptMultiModalInput[Image.Image] @@ -44,6 +48,13 @@ PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] PromptVideoInput = _PromptMultiModalInput[np.ndarray] +def cleanup_dist_env_and_memory(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() + + class VllmRunner: def __init__( diff --git a/tests/test_offline_inference.py b/tests/test_offline_inference.py index 3d64be9..ecff067 100644 --- a/tests/test_offline_inference.py +++ b/tests/test_offline_inference.py @@ -31,20 +31,13 @@ import vllm_ascend # noqa: F401 MODELS = [ "Qwen/Qwen2.5-0.5B-Instruct", ] -os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" -TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half", "float16"]) @pytest.mark.parametrize("max_tokens", [5]) -def test_models( - model: str, - dtype: str, - max_tokens: int, -) -> None: +def test_models(model: str, dtype: str, max_tokens: int) -> None: # 5042 tokens for gemma2 # gemma2 has alternating sliding window size of 4096 # we need a prompt with more than 4096 tokens to test the sliding window @@ -60,6 +53,28 @@ def test_models( vllm_model.generate_greedy(example_prompts, max_tokens) +@pytest.mark.multinpu +@pytest.mark.parametrize("model, distributed_executor_backend", [ + ("Qwen/QwQ-32B", "mp"), +]) +def test_models_distributed(vllm_runner, model: str, + distributed_executor_backend: str) -> None: + example_prompts = [ + "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", + "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", + "Compare and contrast artificial intelligence with human intelligence in terms of processing information.", + ] + dtype = "half" + max_tokens = 5 + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=4, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + if __name__ == "__main__": import pytest pytest.main([__file__])