[CI] Add new runner and enable QwQ multinpu test (#417)
### What this PR does / why we need it?

- Add a new runner to the continuous integration system, keeping the original CI runner until the new runner runs stably.
- Add distributed test cases.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
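The new 4-NPU runner and the existing single-NPU runner split the suite with a pytest marker: single-NPU jobs run `pytest -m 'not multinpu'`, while the multi-NPU jobs run `pytest -m 'multinpu'`. A minimal sketch of how a distributed case opts in (the marker is registered in this PR's `pytest.ini`; the test body here is illustrative only):

```python
import pytest


@pytest.mark.multinpu  # registered in pytest.ini by this PR
def test_something_distributed():
    # Selected on the linux-arm64-npu-4 runner via `pytest -m 'multinpu' tests`,
    # deselected on linux-arm64-npu-1 via `pytest -m 'not multinpu' tests`.
    ...
```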
.github/workflows/vllm_ascend_test.yaml (new file, +367)
@@ -0,0 +1,367 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

name: 'e2e test'

on:
  pull_request:
    branches:
      - 'main'
      - '*-dev'
    paths:
      - '*.txt'
      - '**/*.py'
      - '.github/workflows/vllm_ascend_test.yaml'
      - '!docs/**'
      - 'pytest.ini'

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

concurrency:
  group: pr-${{ github.event.pull_request.number }}
  cancel-in-progress: true

jobs:
  test-singlenpu:
    name: vLLM Ascend test main(single-npu)
    runs-on: linux-arm64-npu-1  # actionlint-ignore: runner-label
    container:
      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y
          apt install git -y
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
          apt-get -y install `cat packages.txt`
          apt-get -y install gcc g++ cmake libnuma-dev

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: ./vllm-empty

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        run: |
          pip install -r requirements-dev.txt
          pip install -e .

      - name: Install pta
        run: |
          if [ ! -d /root/.cache/pta ]; then
            mkdir -p /root/.cache/pta
          fi

          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
            cd /root/.cache/pta
            rm -rf pytorch_v2.5.1_py310*
            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
            tar -zxvf pytorch_v2.5.1_py310.tar.gz
          fi

          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

      - name: Run vllm-project/vllm-ascend test on V0 engine
        env:
          VLLM_USE_V1: 0
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests

      - name: Run vllm-project/vllm-ascend test for V1 Engine
        env:
          VLLM_USE_V1: 1
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          pytest -sv -m 'not multinpu' tests

      - name: Run vllm-project/vllm test for V0 Engine
        env:
          VLLM_USE_V1: 0
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          pytest -sv

  test-multinpu:
    name: vLLM Ascend test main(multi-npu)
    runs-on: linux-arm64-npu-4
    container:
      image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
      env:
        HF_ENDPOINT: https://hf-mirror.com
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

      - name: Install system dependencies
        run: |
          apt-get update -y
          apt-get -y install git wget

      - name: Config git
        run: |
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          pip install -r requirements-dev.txt

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: ./vllm-empty

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        run: |
          pip install -r requirements-dev.txt
          pip install -e .

      - name: Install pta
        run: |
          if [ ! -d /root/.cache/pta ]; then
            mkdir -p /root/.cache/pta
          fi

          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
            cd /root/.cache/pta
            rm -rf pytorch_v2.5.1_py310*
            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
            tar -zxvf pytorch_v2.5.1_py310.tar.gz
          fi

          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

      - name: Run vllm-project/vllm-ascend test on V0 engine
        env:
          VLLM_USE_V1: 0
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests

      - name: Run vllm-project/vllm-ascend test for V1 Engine
        env:
          VLLM_USE_V1: 1
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          pytest -sv -m 'multinpu' tests

  test-singlenpu-v0_8_3:
    name: vLLM Ascend test v0.8.3(single-npu)
    runs-on: linux-arm64-npu-1  # actionlint-ignore: runner-label
    container:
      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y
          apt install git -y
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
          apt-get -y install `cat packages.txt`
          apt-get -y install gcc g++ cmake libnuma-dev

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: v0.8.3
          path: ./vllm-empty

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        run: |
          pip install -r requirements-dev.txt
          pip install -e .

      - name: Install pta
        run: |
          if [ ! -d /root/.cache/pta ]; then
            mkdir -p /root/.cache/pta
          fi

          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
            cd /root/.cache/pta
            rm -rf pytorch_v2.5.1_py310*
            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
            tar -zxvf pytorch_v2.5.1_py310.tar.gz
          fi

          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

      - name: Run vllm-project/vllm-ascend test on V0 engine
        env:
          VLLM_USE_V1: 0
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests

      - name: Run vllm-project/vllm-ascend test for V1 Engine
        env:
          VLLM_USE_V1: 1
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          pytest -sv -m 'not multinpu' tests

      - name: Run vllm-project/vllm test for V0 Engine
        env:
          VLLM_USE_V1: 0
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          pytest -sv

  test-multinpu-v0_8_3:
    name: vLLM Ascend test v0.8.3(multi-npu)
    runs-on: linux-arm64-npu-4
    needs: test-multinpu
    container:
      image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
      env:
        HF_ENDPOINT: https://hf-mirror.com
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

      - name: Install system dependencies
        run: |
          apt-get update -y
          apt-get -y install git wget

      - name: Config git
        run: |
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          pip install -r requirements-dev.txt

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: v0.8.3
          path: ./vllm-empty

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        run: |
          pip install -r requirements-dev.txt
          pip install -e .

      - name: Install pta
        run: |
          if [ ! -d /root/.cache/pta ]; then
            mkdir -p /root/.cache/pta
          fi

          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
            cd /root/.cache/pta
            rm -rf pytorch_v2.5.1_py310*
            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
            tar -zxvf pytorch_v2.5.1_py310.tar.gz
          fi

          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

      - name: Run vllm-project/vllm-ascend test on V0 engine
        env:
          VLLM_USE_V1: 0
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests

      - name: Run vllm-project/vllm-ascend test for V1 Engine
        env:
          VLLM_USE_V1: 1
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          pytest -sv -m 'multinpu' tests
.github/workflows/vllm_ascend_test_main.yaml (deleted, -119)
@@ -1,119 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

name: 'e2e test'

on:
  pull_request:
    branches:
      - 'main'
      - '*-dev'
    paths:
      - '*.txt'
      - '**/*.py'
      - '.github/workflows/vllm_ascend_test_main.yaml'
      - '!docs/**'
      - 'pytest.ini'

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

jobs:
  test:
    name: vLLM Ascend test (self-host)
    runs-on: linux-arm64-npu-1  # actionlint-ignore: runner-label
    container:
      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y
          apt install git -y
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
          apt-get -y install `cat packages.txt`
          apt-get -y install gcc g++ cmake libnuma-dev

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: ./vllm-empty

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        run: |
          pip install -r requirements-dev.txt
          pip install -e .

      - name: Install pta
        run: |
          if [ ! -d /root/.cache/pta ]; then
            mkdir -p /root/.cache/pta
          fi

          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
            cd /root/.cache/pta
            rm -rf pytorch_v2.5.1_py310*
            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
            tar -zxvf pytorch_v2.5.1_py310.tar.gz
          fi

          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

      - name: Run vllm-project/vllm-ascend test for V0 Engine
        env:
          VLLM_USE_V1: 0
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          VLLM_USE_V1=0 pytest -sv tests

      - name: Run vllm-project/vllm-ascend test for V1 Engine
        env:
          VLLM_USE_V1: 1
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          pytest -sv tests

      - name: Run vllm-project/vllm test for V0 Engine
        env:
          VLLM_USE_V1: 0
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
          HF_ENDPOINT: https://hf-mirror.com
        run: |
          pytest -sv
pytest.ini
@@ -1,5 +1,8 @@
 [pytest]
 minversion = 6.0
+markers =
+    singlenpu: tests that run on single npu
+    multinpu: tests that run on multi npu
 norecursedirs =
     vllm-empty/tests/prefix_caching
     vllm-empty/tests/weight_loading
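Registering `singlenpu` and `multinpu` makes the `-m` selection expressions used by the workflow well-defined and avoids unknown-mark warnings. The CI command lines are equivalent to the following `pytest.main` calls (illustrative only, not part of the diff):

```python
import pytest

# Single-NPU runners deselect the distributed cases:
pytest.main(["-sv", "-m", "not multinpu", "tests"])
# The 4-NPU runner selects only them:
pytest.main(["-sv", "-m", "multinpu", "tests"])
```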
tests/conftest.py
@@ -17,14 +17,17 @@
 # limitations under the License.
 #
 
+import gc
 from typing import List, Optional, Tuple, TypeVar, Union
 
 import numpy as np
 import pytest
+import torch
 from PIL import Image
 from vllm import LLM, SamplingParams
 from vllm.config import TaskOption
-from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.distributed.parallel_state import (destroy_distributed_environment,
+                                             destroy_model_parallel)
 from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
@@ -37,6 +40,7 @@ from tests.model_utils import (TokensTextLogprobs,
 logger = init_logger(__name__)
 
 _M = TypeVar("_M")
+
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
 
 PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -44,6 +48,13 @@ PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 
 
+def cleanup_dist_env_and_memory():
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    gc.collect()
+    torch.npu.empty_cache()
+
+
 class VllmRunner:
 
     def __init__(
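The upstream helper `vllm.distributed.cleanup_dist_env_and_memory` is CUDA-oriented when it releases cached memory, so the conftest now carries an NPU-aware replacement built on the same `parallel_state` teardown calls, ending in `torch.npu.empty_cache()`. A sketch of the intended call pattern between cases (the fixture shape below is an assumption, not part of this hunk):

```python
import pytest


@pytest.fixture(autouse=True)
def cleanup_between_tests():
    # Assumed fixture: let the test run, then tear down tensor-parallel state
    # and return NPU memory so the next case can spawn a fresh engine.
    yield
    cleanup_dist_env_and_memory()  # the NPU-aware helper defined above
```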
tests/test_offline_inference.py
@@ -31,20 +31,13 @@ import vllm_ascend  # noqa: F401
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
 ]
-os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
-TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
-
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "float16"])
 @pytest.mark.parametrize("max_tokens", [5])
-def test_models(
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
+def test_models(model: str, dtype: str, max_tokens: int) -> None:
     # 5042 tokens for gemma2
     # gemma2 has alternating sliding window size of 4096
     # we need a prompt with more than 4096 tokens to test the sliding window
@@ -60,6 +53,28 @@ def test_models(
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
+@pytest.mark.multinpu
+@pytest.mark.parametrize("model, distributed_executor_backend", [
+    ("Qwen/QwQ-32B", "mp"),
+])
+def test_models_distributed(vllm_runner, model: str,
+                            distributed_executor_backend: str) -> None:
+    example_prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    dtype = "half"
+    max_tokens = 5
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=4,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
 if __name__ == "__main__":
     import pytest
     pytest.main([__file__])
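Outside the test harness, the new QwQ-32B case corresponds roughly to the following offline-inference setup (a sketch; `vllm_runner` wraps vLLM's `LLM` with extra bookkeeping, and the model weights plus `HF_TOKEN` are needed as in the workflow):

```python
from vllm import LLM, SamplingParams

# Roughly what vllm_runner(model, dtype="half", tensor_parallel_size=4,
# distributed_executor_backend="mp") constructs under the hood:
llm = LLM(
    model="Qwen/QwQ-32B",
    dtype="half",
    tensor_parallel_size=4,             # one weight shard per NPU
    distributed_executor_backend="mp",  # multiprocessing workers; the CI also sets
                                        # VLLM_WORKER_MULTIPROC_METHOD=spawn
)
outputs = llm.generate(
    ["vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."],
    SamplingParams(temperature=0.0, max_tokens=5),  # greedy, like generate_greedy
)
print(outputs[0].outputs[0].text)
```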