From f60bb474f98dff267d9e737a396f65f2430aba09 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Tue, 29 Jul 2025 18:59:05 +0800 Subject: [PATCH] [CI] Enable linux-aarch64-a2 (64GB) and tp2 * 2 max-parallel to speed up CI (#2065) ### What this PR does / why we need it? Currently, a full run of our CI workflow takes about 3 hours in total, which seriously affects the developer experience, so an optimization is urgently needed. After this PR, the running time of the full CI is expected to be shortened to about 1h40min. - Enable linux-aarch64-a2 (64GB) to replace linux-arm64-npu (32GB) - Change TP4 ---> TP2 * 2 max-parallel - Move DeepSeek-V2-Lite-W8A8 to single card test ### Does this PR introduce _any_ user-facing change? No - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/a2480251ec92ba2a849464dde48db8a2b7f6ef81 --------- Signed-off-by: wangli --- .github/actionlint.yaml | 8 ++-- .github/workflows/accuracy_test.yaml | 4 +- .github/workflows/vllm_ascend_doctest.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 7 ++-- .../workflows/vllm_ascend_test_long_term.yaml | 2 +- benchmarks/scripts/run_accuracy.py | 12 +++--- .../disaggregated_prefill_v1/gen_ranktable.sh | 2 +- .../long_term/accuracy/accuracy_multicard.py | 4 +- .../multicard/test_fused_moe_allgather_ep.py | 4 +- .../test_offline_inference_distributed.py | 32 ++++---------- tests/e2e/multicard/test_pipeline_parallel.py | 2 +- .../e2e/multicard/test_torchair_graph_mode.py | 4 +- tests/e2e/singlecard/quant/test_w8a8.py | 42 +++++++++++++++++++ .../e2e/singlecard/test_offline_inference.py | 25 ----------- 14 files changed, 75 insertions(+), 75 deletions(-) create mode 100644 tests/e2e/singlecard/quant/test_w8a8.py diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 78ea6f3..3b4d23f 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -1,8 +1,10 @@ self-hosted-runner: # Labels of self-hosted runner in array of strings. 
labels: - - linux-arm64-npu-1 - - linux-arm64-npu-2 - - linux-arm64-npu-4 + - linux-aarch64-a2-0 + - linux-aarch64-a2-1 + - linux-aarch64-a2-2 + - linux-aarch64-a2-4 + - linux-aarch64-a2-8 - linux-arm64-npu-static-8 - ubuntu-24.04-arm diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 949e76b..0a98feb 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -85,8 +85,8 @@ jobs: }} runs-on: >- ${{ - (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-arm64-npu-4') || - 'linux-arm64-npu-2' + (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-aarch64-a2-2') || + 'linux-aarch64-a2-1' }} strategy: matrix: diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/vllm_ascend_doctest.yaml index 25746db..1b4faea 100644 --- a/.github/workflows/vllm_ascend_doctest.yaml +++ b/.github/workflows/vllm_ascend_doctest.yaml @@ -48,7 +48,7 @@ jobs: matrix: vllm_verison: [v0.9.1-dev, v0.9.1-dev-openeuler, main, main-openeuler] name: vLLM Ascend test - runs-on: linux-arm64-npu-1 + runs-on: linux-aarch64-a2-1 container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_verison }} steps: diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 14d56ab..580559c 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -136,7 +136,7 @@ jobs: strategy: max-parallel: 2 matrix: - os: [linux-arm64-npu-1] + os: [linux-aarch64-a2-1] vllm_version: [main, v0.10.0] name: singlecard e2e test runs-on: ${{ matrix.os }} @@ -213,9 +213,9 @@ jobs: needs: [e2e] if: ${{ needs.e2e.result == 'success' }} strategy: - max-parallel: 1 + max-parallel: 2 matrix: - os: [linux-arm64-npu-4] + os: [linux-aarch64-a2-2] vllm_version: [main, v0.10.0] name: multicard e2e test runs-on: ${{ matrix.os }} @@ -275,7 +275,6 @@ jobs: # To avoid oom, we need to run the test in a single process. 
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo pytest -sv tests/e2e/multicard/test_data_parallel.py diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index d8af789..0dfa7e3 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -42,7 +42,7 @@ jobs: strategy: max-parallel: 2 matrix: - os: [linux-arm64-npu-1, linux-arm64-npu-4] + os: [linux-aarch64-a2-1, linux-aarch64-a2-2] vllm_version: [main, v0.10.0] name: vLLM Ascend long term test runs-on: ${{ matrix.os }} diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py index 35b59bf..cc2f4e2 100644 --- a/benchmarks/scripts/run_accuracy.py +++ b/benchmarks/scripts/run_accuracy.py @@ -50,17 +50,17 @@ MODEL_TYPE = { # Command templates for running evaluations MODEL_RUN_INFO = { "Qwen/Qwen3-30B-A3B": ( - "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n" + "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n" "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen3-8B-Base": ( - "export 
MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" + "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n" "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen2.5-VL-7B-Instruct": ( - "export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n" + "export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n" "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --batch_size 1" ), @@ -94,9 +94,9 @@ EXECUTION_MODE = { # Model arguments for evaluation MODEL_ARGS = { - "Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6", - "Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2", - "Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True", + "Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6", + "Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2", + "Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True", } # Whether to apply chat template formatting diff --git a/examples/disaggregated_prefill_v1/gen_ranktable.sh b/examples/disaggregated_prefill_v1/gen_ranktable.sh index 33d4a32..e8a923a 100644 --- 
a/examples/disaggregated_prefill_v1/gen_ranktable.sh +++ b/examples/disaggregated_prefill_v1/gen_ranktable.sh @@ -76,4 +76,4 @@ if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then --master_addr ${MASTER_ADDR} \ --master_port ${MASTER_PORT} \ gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT -fi \ No newline at end of file +fi diff --git a/tests/e2e/long_term/accuracy/accuracy_multicard.py b/tests/e2e/long_term/accuracy/accuracy_multicard.py index 9dd77a9..2bfb389 100644 --- a/tests/e2e/long_term/accuracy/accuracy_multicard.py +++ b/tests/e2e/long_term/accuracy/accuracy_multicard.py @@ -91,9 +91,9 @@ MORE_ARGS = { "Qwen/Qwen2.5-0.5B-Instruct": None, "Qwen/Qwen3-30B-A3B": - "tensor_parallel_size=4,enable_expert_parallel=True,enforce_eager=True", + "tensor_parallel_size=2,enable_expert_parallel=True,enforce_eager=True", "deepseek-ai/DeepSeek-V2-Lite": - "tensor_parallel_size=4,trust_remote_code=True,enforce_eager=True" + "tensor_parallel_size=2,trust_remote_code=True,enforce_eager=True" } multiprocessing.set_start_method("spawn", force=True) diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py index e804d74..916ce05 100644 --- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py +++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py @@ -46,7 +46,7 @@ def test_generate_with_allgather(): sampling_params = SamplingParams(max_tokens=100, temperature=0.0) with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"), - tensor_parallel_size=4, + tensor_parallel_size=2, enforce_eager=True, max_model_len=1024, dtype="auto", @@ -74,7 +74,7 @@ def test_generate_with_alltoall(): sampling_params = SamplingParams(max_tokens=100, temperature=0.0) with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"), - tensor_parallel_size=4, + tensor_parallel_size=2, enforce_eager=True, max_model_len=1024, dtype="auto", diff 
--git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index fa19ec3..224bf45 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -42,7 +42,7 @@ def test_models_distributed_QwQ(): with VllmRunner( "Qwen/QwQ-32B", dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) @@ -57,7 +57,7 @@ def test_models_distributed_DeepSeek_multistream_moe(): with VllmRunner( "vllm-ascend/DeepSeek-V3-Pruning", dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", additional_config={ "torchair_graph_config": { @@ -82,7 +82,7 @@ def test_models_distributed_DeepSeek_dbo(): with VllmRunner( "deepseek-ai/DeepSeek-V2-Lite", dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: model_arch = 'DeepseekV2ForCausalLM' @@ -106,7 +106,7 @@ def test_models_distributed_DeepSeekV3_dbo(): with VllmRunner( "vllm-ascend/DeepSeek-V3-Pruning", dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: model_arch = 'DeepseekV3ForCausalLM' @@ -118,24 +118,6 @@ def test_models_distributed_DeepSeekV3_dbo(): vllm_model.generate(example_prompts, sampling_params) -@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in") -def test_models_distributed_DeepSeek_W8A8(): - example_prompts = [ - "Hello, my name is", - ] - max_tokens = 5 - - with VllmRunner( - snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"), - max_model_len=8192, - enforce_eager=True, - dtype="auto", - tensor_parallel_size=4, - quantization="ascend", - ) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - def test_models_distributed_pangu(): example_prompts = [ "Hello, my 
name is", @@ -147,7 +129,7 @@ def test_models_distributed_pangu(): max_model_len=8192, enforce_eager=True, dtype="auto", - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) @@ -169,7 +151,7 @@ def test_models_distributed_topk() -> None: with VllmRunner( "deepseek-ai/DeepSeek-V2-Lite", dtype=dtype, - tensor_parallel_size=4, + tensor_parallel_size=2, distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate(example_prompts, sampling_params) @@ -186,7 +168,7 @@ def test_models_distributed_Qwen3_W8A8(): max_model_len=8192, enforce_eager=True, dtype="auto", - tensor_parallel_size=4, + tensor_parallel_size=2, quantization="ascend", ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py index 612744e..8dd3a90 100644 --- a/tests/e2e/multicard/test_pipeline_parallel.py +++ b/tests/e2e/multicard/test_pipeline_parallel.py @@ -22,7 +22,7 @@ MODELS = [ "Qwen/Qwen3-0.6B", ] -TENSOR_PARALLELS = [2] +TENSOR_PARALLELS = [1] PIPELINE_PARALLELS = [2] DIST_EXECUTOR_BACKEND = ["mp", "ray"] diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py index 9d83d98..9ad336c 100644 --- a/tests/e2e/multicard/test_torchair_graph_mode.py +++ b/tests/e2e/multicard/test_torchair_graph_mode.py @@ -30,7 +30,7 @@ os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" def _deepseek_torchair_test_fixture( additional_config: Dict, *, - tensor_parallel_size=4, + tensor_parallel_size=2, ): example_prompts = [ "Hello, my name is", @@ -98,7 +98,7 @@ def test_e2e_deepseekv3_with_torchair_ms_mla(): def _pangu_torchair_test_fixture( additional_config: Dict, *, - tensor_parallel_size=4, + tensor_parallel_size=2, ): example_prompts = [ "Hello, my name is", diff --git a/tests/e2e/singlecard/quant/test_w8a8.py 
b/tests/e2e/singlecard/quant/test_w8a8.py new file mode 100644 index 0000000..6123d9b --- /dev/null +++ b/tests/e2e/singlecard/quant/test_w8a8.py @@ -0,0 +1,42 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import pytest +from modelscope import snapshot_download # type: ignore[import-untyped] + +from tests.e2e.conftest import VllmRunner + +MODELS = [ + "vllm-ascend/DeepSeek-V2-Lite-W8A8", + "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8" +] + + +@pytest.mark.parametrize("model", MODELS) +def test_quant_W8A8(example_prompts, model): + max_tokens = 5 + model_path = snapshot_download(model) + with VllmRunner( + model_path, + max_model_len=8192, + enforce_eager=True, + dtype="auto", + gpu_memory_utilization=0.7, + quantization="ascend", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/singlecard/test_offline_inference.py b/tests/e2e/singlecard/test_offline_inference.py index c6c68e5..687bb2d 100644 --- a/tests/e2e/singlecard/test_offline_inference.py +++ b/tests/e2e/singlecard/test_offline_inference.py @@ -25,7 +25,6 @@ from unittest.mock import patch import pytest import vllm # noqa: F401 -from modelscope import snapshot_download # type: ignore[import-untyped] from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import 
ImageAsset @@ -40,9 +39,6 @@ MODELS = [ MULTIMODALITY_VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"] MULTIMODALITY_AUDIO_MODELS = ["Qwen/Qwen2-Audio-7B-Instruct"] -QUANTIZATION_MODELS = [ - "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8", -] os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" AUDIO_ASSETS = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] AUDIO_PROMPT_TEMPLATES = { @@ -70,27 +66,6 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None: vllm_model.generate_greedy(example_prompts, max_tokens) -@pytest.mark.parametrize("model", QUANTIZATION_MODELS) -@pytest.mark.parametrize("max_tokens", [5]) -def test_quantization_models(model: str, max_tokens: int) -> None: - prompt = "The following numbers of the sequence " + ", ".join( - str(i) for i in range(1024)) + " are:" - example_prompts = [prompt] - - # NOTE: Using quantized model repo id from modelscope encounters an issue, - # this pr (https://github.com/vllm-project/vllm/pull/19212) fix the issue, - # after it is being merged, there's no need to download model explicitly. - model_path = snapshot_download(model) - - with VllmRunner(model_path, - max_model_len=8192, - enforce_eager=True, - dtype="auto", - gpu_memory_utilization=0.7, - quantization="ascend") as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - - @pytest.mark.parametrize("model", MULTIMODALITY_VL_MODELS) def test_multimodal_vl(model, prompt_template, vllm_runner): image = ImageAsset("cherry_blossom") \