diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 84117eda..ed744973 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -15,6 +15,9 @@ on:
       type:
         required: true
         type: string
+      contains_310:
+        required: true
+        type: boolean
 
 jobs:
   e2e:
@@ -328,3 +331,119 @@ jobs:
 
           # spec_decode
           pytest -sv --durations=0 tests/e2e/multicard/4-cards/spec_decode/test_mtp_qwen3_next.py
+
+  e2e_310p:
+    name: 310p singlecard
+    runs-on: linux-aarch64-310p-1
+    if: ${{ inputs.contains_310 }}
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-310p-ubuntu22.04-py3.11
+      env:
+        VLLM_LOGGING_LEVEL: ERROR
+        VLLM_USE_MODELSCOPE: True
+        TRANSFORMERS_OFFLINE: 1
+    steps:
+      - name: Check npu and CANN info
+        run: |
+          npu-smi info
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+      - name: Config mirrors
+        run: |
+          sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
+          pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
+          pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
+          apt-get update -y
+          apt install git -y
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v6
+
+      - name: Install system dependencies
+        run: |
+          apt-get -y install `cat packages.txt`
+          apt-get -y install gcc g++ cmake libnuma-dev
+
+      - name: Checkout vllm-project/vllm repo
+        uses: actions/checkout@v6
+        with:
+          repository: vllm-project/vllm
+          ref: ${{ inputs.vllm }}
+          path: ./vllm-empty
+          fetch-depth: 1
+
+      - name: Install vllm-project/vllm from source
+        working-directory: ./vllm-empty
+        run: |
+          VLLM_TARGET_DEVICE=empty pip install -e .
+
+      - name: Install vllm-project/vllm-ascend
+        env:
+          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
+        run: |
+          pip install -r requirements-dev.txt
+          pip install -v -e .
+
+      - name: Run vllm-project/vllm-ascend test
+        env:
+          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+        run: |
+          pytest -sv --durations=0 tests/e2e/310p/test_offline_inference_310p.py
+
+  e2e_310p-4cards:
+    name: 310p multicards 4cards
+    runs-on: linux-aarch64-310p-4
+    if: ${{ inputs.contains_310 }}
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-310p-ubuntu22.04-py3.11
+      env:
+        VLLM_LOGGING_LEVEL: ERROR
+        VLLM_USE_MODELSCOPE: True
+        TRANSFORMERS_OFFLINE: 1
+    steps:
+      - name: Check npu and CANN info
+        run: |
+          npu-smi info
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+      - name: Config mirrors
+        run: |
+          sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
+          pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
+          pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
+          apt-get update -y
+          apt install git -y
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v6
+
+      - name: Install system dependencies
+        run: |
+          apt-get -y install `cat packages.txt`
+          apt-get -y install gcc g++ cmake libnuma-dev
+
+      - name: Checkout vllm-project/vllm repo
+        uses: actions/checkout@v6
+        with:
+          repository: vllm-project/vllm
+          ref: ${{ inputs.vllm }}
+          path: ./vllm-empty
+          fetch-depth: 1
+
+      - name: Install vllm-project/vllm from source
+        working-directory: ./vllm-empty
+        run: |
+          VLLM_TARGET_DEVICE=empty pip install -e .
+
+      - name: Install vllm-project/vllm-ascend
+        env:
+          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
+        run: |
+          pip install -r requirements-dev.txt
+          pip install -v -e .
+
+      - name: Run vllm-project/vllm-ascend test
+        env:
+          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+        run: |
+          pytest -sv --durations=0 tests/e2e/310p/test_offline_inference_parallel_310p.py
diff --git a/.github/workflows/labled_test_310.yaml b/.github/workflows/labled_test_310.yaml
deleted file mode 100644
index acd06346..00000000
--- a/.github/workflows/labled_test_310.yaml
+++ /dev/null
@@ -1,110 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-
-name: 310p Labeled Test
-
-on:
-  pull_request:
-    types: [ labeled ]
-
-# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
-# declared as "shell: bash -el {0}" on steps that need to be properly activated.
-# It's used to activate ascend-toolkit environment variables.
-defaults:
-  run:
-    shell: bash -el {0}
-
-# only cancel in-progress runs of the same workflow
-# and ignore the lint / 1 card / 4 cards test type
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  e2e:
-    # e2e-310p-test will be triggered when tag 'e2e-310p-test' & 'ready-for-test' or schedule job
-    if: >-
-      ${{
-      (contains(github.event.pull_request.labels.*.name, 'e2e-310p-test')) &&
-      contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
-      github.event_name == 'schedule' || github.event_name == 'push'
-      }}
-    strategy:
-      max-parallel: 2
-      matrix:
-        os: [linux-aarch64-310p-1, linux-aarch64-310p-4]
-        vllm_version: [v0.11.0]
-    name: 310p e2e test
-    runs-on: ${{ matrix.os }}
-    container:
-      # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-310p-ubuntu22.04-py3.11
-      env:
-        VLLM_LOGGING_LEVEL: ERROR
-        VLLM_USE_MODELSCOPE: True
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
-          pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
-          pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
-          apt-get update -y
-          apt install git -y
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v6
-
-      - name: Install system dependencies
-        run: |
-          apt-get -y install `cat packages.txt`
-          apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v6
-        with:
-          repository: vllm-project/vllm
-          ref: ${{ matrix.vllm_version }}
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        run: |
-          export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
-          pip install -r requirements-dev.txt
-          pip install -v -e .
-
-      - name: Run e2e test
-        env:
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          VLLM_USE_MODELSCOPE: True
-          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
-        run: |
-          if [[ "${{ matrix.os }}" == "linux-aarch64-310p-1" ]]; then
-            pytest -sv tests/e2e/310p/test_offline_inference_310p.py
-          else
-            pytest -sv tests/e2e/310p/test_offline_inference_parallel_310p.py
-          fi
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 9d26c2ac..16461015 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -83,4 +83,5 @@ jobs:
       vllm: ${{ matrix.vllm_version }}
       runner: linux-aarch64-a2
       image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
+      contains_310: true
       type: full
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 405d1ba6..3180f561 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -47,6 +47,7 @@ jobs:
     outputs:
       e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
       ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
+      _310_tracker: ${{ steps.filter.outputs._310_tracker }}
     steps:
       - name: Setup git proxy
        run: |
@@ -73,6 +74,8 @@ jobs:
            ut_tracker:
              - 'tests/ut/**'
              - '.github/workflows/pr_test_light.yaml'
+            _310_tracker:
+              - 'vllm_ascend/_310p/**'
 
   ut:
     needs: [lint, changes]
@@ -103,4 +106,5 @@ jobs:
       vllm: ${{ matrix.vllm_version }}
       runner: linux-aarch64-a2
       image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
+      contains_310: ${{ needs.changes.outputs._310_tracker == 'true' }}
       type: light
diff --git a/.github/workflows/schedule_test_vllm_main.yaml b/.github/workflows/schedule_test_vllm_main.yaml
index 7f8c7876..c233d25d 100644
--- a/.github/workflows/schedule_test_vllm_main.yaml
+++ b/.github/workflows/schedule_test_vllm_main.yaml
@@ -36,4 +36,5 @@ jobs:
       vllm: main
       runner: linux-aarch64-a2
       image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
+      contains_310: false
       type: full
diff --git a/tests/e2e/310p/test_offline_inference_310p.py b/tests/e2e/310p/test_offline_inference_310p.py
index 31f7eb92..188865f6 100644
--- a/tests/e2e/310p/test_offline_inference_310p.py
+++ b/tests/e2e/310p/test_offline_inference_310p.py
@@ -15,58 +15,61 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 
 import pytest
-import vllm  # noqa: F401
-from vllm import SamplingParams
+from vllm.assets.image import ImageAsset
 
-import vllm_ascend  # noqa: F401
 from tests.e2e.conftest import VllmRunner
 
-MODELS = ["Qwen/Qwen3-0.6B", "Qwen/Qwen2.5-7B-Instruct"]
-
 
-@pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float16"])
 @pytest.mark.parametrize("max_tokens", [5])
-def test_models(model: str, dtype: str, max_tokens: int) -> None:
+def test_llm_models(dtype: str, max_tokens: int) -> None:
     example_prompts = [
         "Hello, my name is",
         "The future of AI is",
     ]
 
-    with VllmRunner(model,
+    with VllmRunner("Qwen/Qwen3-0.6B",
                     tensor_parallel_size=1,
                     dtype=dtype,
                     max_model_len=2048,
-                    enforce_eager=True,
-                    compilation_config={
-                        "custom_ops":
-                        ["none", "+rms_norm", "+rotary_embedding"]
-                    }) as vllm_model:
+                    enforce_eager=True) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+def test_multimodal_vl():
+    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
 
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float16"])
-def test_vl_model_with_samples(model: str, dtype: str) -> None:
-    example_prompts = [
-        "Hello, my name is",
-        "The future of AI is",
+    img_questions = [
+        "What is the content of this image?",
+        "Describe the content of this image in detail.",
+        "What's in the image?",
+        "Where is this image taken?",
     ]
 
-    with VllmRunner(model,
-                    tensor_parallel_size=1,
-                    dtype=dtype,
-                    max_model_len=2048,
+    images = [image] * len(img_questions)
+    placeholder = "<|image_pad|>"
+    prompts = [
+        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+         f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in img_questions
+    ]
+
+    with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
+                    mm_processor_kwargs={
+                        "min_pixels": 28 * 28,
+                        "max_pixels": 1280 * 28 * 28,
+                        "fps": 1,
+                    },
+                    max_model_len=8192,
                     enforce_eager=True,
-                    compilation_config={
-                        "custom_ops":
-                        ["none", "+rms_norm", "+rotary_embedding"]
-                    }) as vllm_model:
-        sampling_params = SamplingParams(max_tokens=100,
-                                         top_p=0.95,
-                                         top_k=50,
-                                         temperature=0.6)
-        vllm_model.generate(example_prompts, sampling_params)
+                    limit_mm_per_prompt={"image": 1}) as vllm_model:
+        outputs = vllm_model.generate_greedy(
+            prompts=prompts,
+            images=images,
+            max_tokens=64,
+        )
+
+        assert len(outputs) == len(prompts)
+
+        for _, output_str in outputs:
+            assert output_str, "Generated output should not be empty."
diff --git a/tests/e2e/310p/test_offline_inference_parallel_310p.py b/tests/e2e/310p/test_offline_inference_parallel_310p.py
new file mode 100644
index 00000000..c6467d60
--- /dev/null
+++ b/tests/e2e/310p/test_offline_inference_parallel_310p.py
@@ -0,0 +1,20 @@
+import pytest
+
+from tests.e2e.conftest import VllmRunner
+
+
+@pytest.mark.parametrize("dtype", ["float16"])
+@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.skip(reason="310p does not support parallel inference now. Fix me")
+def test_models(dtype: str, max_tokens: int) -> None:
+    example_prompts = [
+        "Hello, my name is",
+        "The future of AI is",
+    ]
+
+    with VllmRunner("Qwen/Qwen3-0.6B",
+                    tensor_parallel_size=4,
+                    dtype=dtype,
+                    max_model_len=2048,
+                    enforce_eager=True) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)