From e56dba9b0d44dbdd37f8c57ea89658f6af19b031 Mon Sep 17 00:00:00 2001
From: SILONG ZENG <2609716663@qq.com>
Date: Thu, 11 Dec 2025 20:35:32 +0800
Subject: [PATCH] [CI] cleanup e2e test (#4800)

### What this PR does / why we need it?

This PR refactors the E2E multicard test suite to improve test case identification and maintainability. Specifically, it renames test functions to be more descriptive (explicitly indicating the model family, such as Qwen or DeepSeek, and the parallelism strategy, such as DP/TP/PP/EP) and cleans up outdated or redundant test configurations in the offline distributed inference tests.

**Key Changes:**

1. Test Function Renaming (Standardization): Renamed multiple test functions across **`tests/e2e/multicard/`** to include clear suffixes/prefixes for the model and parallel strategy. This helps differentiate test cases in CI logs and prevents naming collisions; see the example after this list.

**`test_aclgraph_capture_replay.py`:**
- `test_aclgraph_capture_replay_dp2` -> `test_aclgraph_capture_replay_metrics_dp2`

**`test_data_parallel.py`:**
- `test_data_parallel_inference` -> `test_qwen_inference_dp2`

**`test_data_parallel_tp2.py`:**
- `test_data_parallel_inference` -> `test_qwen_inference_dp2_tp2`

**`test_expert_parallel.py`:**
- `test_e2e_ep_correctness` -> `test_deepseek_correctness_ep`

**`test_external_launcher.py`:**
- `test_external_launcher` -> `test_qwen_external_launcher`
- `test_moe_external_launcher` -> `test_qwen_moe_external_launcher_ep`
- `test_external_launcher_and_sleepmode` -> `test_qwen_external_launcher_with_sleepmode`
- `test_external_launcher_and_sleepmode_level2` -> `test_qwen_external_launcher_with_sleepmode_level2`
- `test_mm_allreduce` -> `test_qwen_external_launcher_with_matmul_allreduce`

**`test_full_graph_mode.py`:**
- `test_models_distributed_Qwen3_MOE_TP2_WITH_FULL_DECODE_ONLY` -> `test_qwen_moe_with_full_decode_only`
- `test_models_distributed_Qwen3_MOE_TP2_WITH_FULL` -> `test_qwen_moe_with_full`

**`test_fused_moe_allgather_ep.py`:**
- `test_generate_with_allgather` -> `test_deepseek_moe_fused_allgather_ep`
- `test_generate_with_alltoall` -> `test_deepseek_moe_fused_alltoall_ep`

**`test_offline_weight_load.py`:**
- `test_offline_weight_load_and_sleepmode` -> `test_qwen_offline_weight_load_and_sleepmode`

**`test_pipeline_parallel.py`:**
- `test_models` -> `test_models_pp2`
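For example, after this change a single test can be selected by a self-describing pytest node ID (both names below are taken from the rename list above):

```bash
# The node ID alone now identifies the model family (qwen/deepseek)
# and the parallel layout (dp2_tp2, ep) being exercised:
pytest -sv tests/e2e/multicard/test_data_parallel_tp2.py::test_qwen_inference_dp2_tp2
pytest -sv tests/e2e/multicard/test_expert_parallel.py::test_deepseek_correctness_ep
```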
2. Distributed Inference Cleanup (**`test_offline_inference_distributed.py`**):

**Model list changes:**

```
 QWEN_DENSE_MODELS = [
-    "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+    "vllm-ascend/Qwen3-8B-W8A8",
 ]
```
```
-QWEN_W4A8_OLD_VERSION_MODELS = [
-    "vllm-ascend/Qwen3-8B-W4A8",
-]
-
-QWEN_W4A8_NEW_VERSION_MODELS = [
+QWEN_W4A8_MODELS = [
     "vllm-ascend/Qwen3-1.7B-W4A8-V1",
 ]

 DEEPSEEK_W4A8_MODELS = [
-    "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
-    "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
+    "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
 ]
```

**Test Function Changes:**
- removed `test_models_distributed_QwQ`
- removed `test_models_distributed_Qwen3_W8A8`
- removed `test_models_distributed_Qwen3_W4A8DYNAMIC_old_version`
- `test_models_distributed_Qwen3_W4A8DYNAMIC_new_version` -> `test_models_distributed_Qwen3_W4A8DYNAMIC`

- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
---
 .github/workflows/_e2e_test.yaml              |  6 +-
 .../multicard/test_aclgraph_capture_replay.py |  2 +-
 tests/e2e/multicard/test_data_parallel.py     |  2 +-
 tests/e2e/multicard/test_data_parallel_tp2.py |  2 +-
 tests/e2e/multicard/test_expert_parallel.py   |  2 +-
 tests/e2e/multicard/test_external_launcher.py | 10 +-
 tests/e2e/multicard/test_full_graph_mode.py   |  4 +-
 .../multicard/test_fused_moe_allgather_ep.py  |  4 +-
 .../test_offline_inference_distributed.py     | 63 +------------
 .../e2e/multicard/test_offline_weight_load.py |  2 +-
 tests/e2e/multicard/test_pipeline_parallel.py | 94 +++++++++----------
 11 files changed, 67 insertions(+), 124 deletions(-)

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 9af889eb..cda75bae 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -189,11 +189,8 @@ jobs:
           #pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
           # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W8A8
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC_new_version
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC_old_version
+          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_fc2_for_qwen3_moe
@@ -272,7 +269,6 @@ jobs:
           # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
           # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_W8A8_WITH_EP
           pytest -sv tests/e2e/multicard/test_data_parallel_tp2.py
-
       - name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
         shell: bash -l {0}
         run: |
diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py
index 4375e825..e81b5615 100644
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -134,7 +134,7 @@ def _run_worker_process(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [4, 36])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
-def test_aclgraph_capture_replay_dp2(
+def test_aclgraph_capture_replay_metrics_dp2(
     model: str,
     max_tokens: int,
     monkeypatch: pytest.MonkeyPatch,
diff --git a/tests/e2e/multicard/test_data_parallel.py b/tests/e2e/multicard/test_data_parallel.py
index e6959b02..cb3c6048 100644
--- a/tests/e2e/multicard/test_data_parallel.py
+++ b/tests/e2e/multicard/test_data_parallel.py
@@ -38,7 +38,7 @@ MODELS = [
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
-def test_data_parallel_inference(model, max_tokens):
+def test_qwen_inference_dp2(model, max_tokens):
     moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
     quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
     script = "examples/offline_data_parallel.py"
diff --git a/tests/e2e/multicard/test_data_parallel_tp2.py b/tests/e2e/multicard/test_data_parallel_tp2.py
index 6b0bdabe..202eaa9c 100644
--- a/tests/e2e/multicard/test_data_parallel_tp2.py
+++ b/tests/e2e/multicard/test_data_parallel_tp2.py
@@ -15,7 +15,7 @@ MODELS = ["Qwen/Qwen3-0.6B"]
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
-def test_data_parallel_inference(model, max_tokens):
+def test_qwen_inference_dp2_tp2(model, max_tokens):
     script = "examples/offline_data_parallel.py"
 
     env = os.environ.copy()
diff --git a/tests/e2e/multicard/test_expert_parallel.py b/tests/e2e/multicard/test_expert_parallel.py
index b8f03d5f..762ca6d2 100644
--- a/tests/e2e/multicard/test_expert_parallel.py
+++ b/tests/e2e/multicard/test_expert_parallel.py
@@ -5,7 +5,7 @@ from tests.e2e.model_utils import check_outputs_equal
 
 
 @pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
-def test_e2e_ep_correctness(model_name):
+def test_deepseek_correctness_ep(model_name):
     example_prompts = [
         "Hello, my name is",
         "The president of the United States is",
diff --git a/tests/e2e/multicard/test_external_launcher.py b/tests/e2e/multicard/test_external_launcher.py
index ece35def..4a4a17ec 100644
--- a/tests/e2e/multicard/test_external_launcher.py
+++ b/tests/e2e/multicard/test_external_launcher.py
@@ -37,7 +37,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
-def test_external_launcher(model):
+def test_qwen_external_launcher(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -78,7 +78,7 @@ def test_external_launcher(model):
 
 
 @pytest.mark.parametrize("model", MOE_MODELS)
-def test_moe_external_launcher(model):
+def test_qwen_moe_external_launcher_ep(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -109,7 +109,7 @@ def test_moe_external_launcher(model):
 
 
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_external_launcher_and_sleepmode():
+def test_qwen_external_launcher_with_sleepmode():
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -154,7 +154,7 @@ def test_external_launcher_and_sleepmode():
 
 
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_external_launcher_and_sleepmode_level2():
+def test_qwen_external_launcher_with_sleepmode_level2():
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -210,7 +210,7 @@
         "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
         "HCCL_BUFFSIZE": "500"
     })
-def test_mm_allreduce(model):
+def test_qwen_external_launcher_with_matmul_allreduce(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
diff --git a/tests/e2e/multicard/test_full_graph_mode.py b/tests/e2e/multicard/test_full_graph_mode.py
index 3ccbf823..c788e9da 100644
--- a/tests/e2e/multicard/test_full_graph_mode.py
+++ b/tests/e2e/multicard/test_full_graph_mode.py
@@ -29,7 +29,7 @@ from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal
 
 
-def test_models_distributed_Qwen3_MOE_TP2_WITH_FULL_DECODE_ONLY():
+def test_qwen_moe_with_full_decode_only():
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
     prompts = [
@@ -75,7 +75,7 @@ def test_models_distributed_Qwen3_MOE_TP2_WITH_FULL_DECODE_ONLY():
     )
 
 
-def test_models_distributed_Qwen3_MOE_TP2_WITH_FULL():
+def test_qwen_moe_with_full():
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
     prompts = [
diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
index 85d246e5..4fa111ce 100644
--- a/tests/e2e/multicard/test_fused_moe_allgather_ep.py
+++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py
@@ -41,7 +41,7 @@ from tests.e2e.conftest import VllmRunner
         "TASK_QUEUE_ENABLE": "1",
         "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
     })
-def test_generate_with_allgather():
+def test_deepseek_moe_fused_allgather_ep():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100,
                                      temperature=0.0)
@@ -62,7 +62,7 @@ def test_generate_with_allgather():
         "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
         "TASK_QUEUE_ENABLE": "1"
     })
-def test_generate_with_alltoall():
+def test_deepseek_moe_fused_alltoall_ep():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100,
                                      temperature=0.0)
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py
index a13276bb..f4cf5a25 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -33,20 +33,15 @@ os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 QWEN_DENSE_MODELS = [
-    "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+    "vllm-ascend/Qwen3-8B-W8A8",
 ]
 
-QWEN_W4A8_OLD_VERSION_MODELS = [
-    "vllm-ascend/Qwen3-8B-W4A8",
-]
-
-QWEN_W4A8_NEW_VERSION_MODELS = [
+QWEN_W4A8_MODELS = [
     "vllm-ascend/Qwen3-1.7B-W4A8-V1",
 ]
 
 DEEPSEEK_W4A8_MODELS = [
-    "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
-    "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
+    "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
 ]
 
 KIMI_W4A16_MODELS = [
@@ -54,22 +49,6 @@ KIMI_W4A16_MODELS = [
 ]
 
 
-def test_models_distributed_QwQ():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with VllmRunner(
-            "Qwen/QwQ-32B",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-            enforce_eager=False,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
 def test_models_distributed_DeepSeek_multistream_moe():
     example_prompts = [
         "Hello, my name is",
@@ -89,40 +68,8 @@ def test_models_distributed_DeepSeek_multistream_moe():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-def test_models_distributed_Qwen3_W8A8():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-
-    with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-8B-W8A8"),
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
-@pytest.mark.parametrize("model", QWEN_W4A8_OLD_VERSION_MODELS)
-def test_models_distributed_Qwen3_W4A8DYNAMIC_old_version(model):
-    prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            snapshot_download(model),
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(prompts, max_tokens)
-
-
-@pytest.mark.parametrize("model", QWEN_W4A8_NEW_VERSION_MODELS)
-def test_models_distributed_Qwen3_W4A8DYNAMIC_new_version(model):
+@pytest.mark.parametrize("model", QWEN_W4A8_MODELS)
+def test_models_distributed_Qwen3_W4A8DYNAMIC(model):
     prompts = [
         "Hello, my name is",
     ]
diff --git a/tests/e2e/multicard/test_offline_weight_load.py b/tests/e2e/multicard/test_offline_weight_load.py
index 0e9ba95c..dd0ac01f 100644
--- a/tests/e2e/multicard/test_offline_weight_load.py
+++ b/tests/e2e/multicard/test_offline_weight_load.py
@@ -31,7 +31,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]
 
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_offline_weight_load_and_sleepmode(model):
+def test_qwen_offline_weight_load_and_sleepmode(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py
index fa21fe8d..855724ea 100644
--- a/tests/e2e/multicard/test_pipeline_parallel.py
+++ b/tests/e2e/multicard/test_pipeline_parallel.py
@@ -1,47 +1,47 @@
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-import pytest
-
-from tests.e2e.conftest import VllmRunner
-
-MODELS = [
-    "Qwen/Qwen3-0.6B",
-    "deepseek-ai/DeepSeek-V2-Lite-Chat",
-]
-
-TENSOR_PARALLELS = [1]
-PIPELINE_PARALLELS = [2]
-DIST_EXECUTOR_BACKEND = ["mp", "ray"]
-
-prompts = [
-    "Hello, my name is",
-    "The future of AI is",
-]
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
-@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
-@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
-def test_models(model: str, tp_size: int, pp_size: int,
-                distributed_executor_backend: str) -> None:
-    with VllmRunner(model,
-                    tensor_parallel_size=tp_size,
-                    pipeline_parallel_size=pp_size,
-                    distributed_executor_backend=distributed_executor_backend,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_model.generate_greedy(prompts, 64)
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+import pytest
+
+from tests.e2e.conftest import VllmRunner
+
+MODELS = [
+    "Qwen/Qwen3-0.6B",
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
+]
+
+TENSOR_PARALLELS = [1]
+PIPELINE_PARALLELS = [2]
+DIST_EXECUTOR_BACKEND = ["mp", "ray"]
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
+@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
+@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
+def test_models_pp2(model: str, tp_size: int, pp_size: int,
+                    distributed_executor_backend: str) -> None:
+    with VllmRunner(model,
+                    tensor_parallel_size=tp_size,
+                    pipeline_parallel_size=pp_size,
+                    distributed_executor_backend=distributed_executor_backend,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy(prompts, 64)