[CI] cleanup e2e test (#4800)

### What this PR does / why we need it?
This PR refactors the E2E multicard test suite to improve test case
identification and maintainability. Specifically, it renames various
test functions to be more descriptive (explicitly indicating model
families like Qwen/DeepSeek and parallelism strategies like DP/TP/PP/EP)
and cleans up outdated or redundant test configurations in the offline
distributed inference tests.

**Key Changes:**
1. Test Function Renaming (Standardization): Renamed multiple test
functions across **`tests/e2e/multicard/`** so that each name identifies
the model family (e.g. Qwen, DeepSeek) and the parallelism strategy
(DP/TP/PP/EP) it exercises. This makes test cases easy to tell apart in
CI logs and prevents naming collisions; see the selection example after
the list below.

**`test_aclgraph_capture_replay.py`:** 
- `test_aclgraph_capture_replay_dp2` ->
`test_aclgraph_capture_replay_metrics_dp2`

**`test_data_parallel.py`:**
- `test_data_parallel_inference` -> `test_qwen_inference_dp2`

**`test_data_parallel_tp2.py`:**
- `test_data_parallel_inference` -> `test_qwen_inference_dp2_tp2`

**`test_expert_parallel.py`:**
- `test_e2e_ep_correctness` -> `test_deepseek_correctness_ep`

**`test_external_launcher.py`:**
- `test_external_launcher` -> `test_qwen_external_launcher`
- `test_moe_external_launcher` -> `test_qwen_moe_external_launcher_ep`
- `test_external_launcher_and_sleepmode` ->
`test_qwen_external_launcher_with_sleepmode`
- `test_external_launcher_and_sleepmode_level2` ->
`test_qwen_external_launcher_with_sleepmode_level2`
- `test_mm_allreduce` ->
`test_qwen_external_launcher_with_matmul_allreduce`

**`test_full_graph_mode.py`:** 
- `test_models_distributed_Qwen3_MOE_TP2_WITH_FULL_DECODE_ONLY` ->
`test_qwen_moe_with_full_decode_only`
- `test_models_distributed_Qwen3_MOE_TP2_WITH_FULL` ->
`test_qwen_moe_with_full`

**`test_fused_moe_allgather_ep.py`:** 
- `test_generate_with_allgather` ->
`test_deepseek_moe_fused_allgather_ep`
- `test_generate_with_alltoall` -> `test_deepseek_moe_fused_alltoall_ep`

**`test_offline_weight_load.py`:**
- `test_offline_weight_load_and_sleepmode` ->
`test_qwen_offline_weight_load_and_sleepmode`

**`test_pipeline_parallel.py`:**
- `test_models` -> `test_models_pp2`
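
With the model family and parallel strategy embedded in each name,
related cases are easy to pick out in CI logs or to select locally. A
small illustration using plain pytest selection (the `-k` expression is
our own example, not something added by this PR):

```bash
# Keyword-select the renamed Qwen data-parallel cases
# (pytest -k matches test names case-insensitively).
pytest -sv tests/e2e/multicard -k "qwen and dp2"

# Or target one renamed test directly by its node ID.
pytest -sv tests/e2e/multicard/test_data_parallel.py::test_qwen_inference_dp2
```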

2. Distributed Inference Cleanup
(**`test_offline_inference_distributed.py`**):

**Model List Changes:**
```diff
  QWEN_DENSE_MODELS = [
-     "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+     "vllm-ascend/Qwen3-8B-W8A8",
  ]
```

```diff
- QWEN_W4A8_OLD_VERSION_MODELS = [
-     "vllm-ascend/Qwen3-8B-W4A8",
- ]

- QWEN_W4A8_NEW_VERSION_MODELS = [
+ QWEN_W4A8_MODELS = [
      "vllm-ascend/Qwen3-1.7B-W4A8-V1",
  ]

  DEEPSEEK_W4A8_MODELS = [
-     "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
      "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
  ]
```

**Test Function Changes:**
- removed `test_models_distributed_QwQ`
- removed `test_models_distributed_Qwen3_W8A8`
- removed `test_models_distributed_Qwen3_W4A8DYNAMIC_old_version`
- `test_models_distributed_Qwen3_W4A8DYNAMIC_new_version` ->
`test_models_distributed_Qwen3_W4A8DYNAMIC`
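
With the old/new-version split gone, CI needs only one W4A8-dynamic
invocation (the exact change is visible in the workflow diff below);
the same case runs locally as:

```bash
# Single consolidated W4A8-dynamic target, replacing the former
# *_old_version / *_new_version pair.
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
```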

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Author: SILONG ZENG
Committed by: GitHub
Date: 2025-12-11 20:35:32 +08:00
Parent: 3349f61769
Commit: e56dba9b0d
11 changed files with 67 additions and 124 deletions

**CI workflow**

```diff
@@ -189,11 +189,8 @@ jobs:
           #pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
           # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W8A8
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC_new_version
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC_old_version
+          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_fc2_for_qwen3_moe
@@ -272,7 +269,6 @@ jobs:
           # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
           # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_W8A8_WITH_EP
           pytest -sv tests/e2e/multicard/test_data_parallel_tp2.py
       - name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
         shell: bash -l {0}
         run: |
```

**`tests/e2e/multicard/test_aclgraph_capture_replay.py`**

```diff
@@ -134,7 +134,7 @@ def _run_worker_process(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [4, 36])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
-def test_aclgraph_capture_replay_dp2(
+def test_aclgraph_capture_replay_metrics_dp2(
     model: str,
     max_tokens: int,
     monkeypatch: pytest.MonkeyPatch,
```

**`tests/e2e/multicard/test_data_parallel.py`**

```diff
@@ -38,7 +38,7 @@ MODELS = [
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
-def test_data_parallel_inference(model, max_tokens):
+def test_qwen_inference_dp2(model, max_tokens):
     moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
     quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
     script = "examples/offline_data_parallel.py"
```

**`tests/e2e/multicard/test_data_parallel_tp2.py`**

```diff
@@ -15,7 +15,7 @@ MODELS = ["Qwen/Qwen3-0.6B"]
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
-def test_data_parallel_inference(model, max_tokens):
+def test_qwen_inference_dp2_tp2(model, max_tokens):
     script = "examples/offline_data_parallel.py"
     env = os.environ.copy()
```

**`tests/e2e/multicard/test_expert_parallel.py`**

```diff
@@ -5,7 +5,7 @@ from tests.e2e.model_utils import check_outputs_equal
 @pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
-def test_e2e_ep_correctness(model_name):
+def test_deepseek_correctness_ep(model_name):
     example_prompts = [
         "Hello, my name is",
         "The president of the United States is",
```

**`tests/e2e/multicard/test_external_launcher.py`**

```diff
@@ -37,7 +37,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
-def test_external_launcher(model):
+def test_qwen_external_launcher(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -78,7 +78,7 @@ def test_external_launcher(model):
 @pytest.mark.parametrize("model", MOE_MODELS)
-def test_moe_external_launcher(model):
+def test_qwen_moe_external_launcher_ep(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -109,7 +109,7 @@ def test_moe_external_launcher(model):
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_external_launcher_and_sleepmode():
+def test_qwen_external_launcher_with_sleepmode():
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -154,7 +154,7 @@ def test_external_launcher_and_sleepmode():
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_external_launcher_and_sleepmode_level2():
+def test_qwen_external_launcher_with_sleepmode_level2():
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
@@ -210,7 +210,7 @@ def test_external_launcher_and_sleepmode_level2():
         "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
         "HCCL_BUFFSIZE": "500"
     })
-def test_mm_allreduce(model):
+def test_qwen_external_launcher_with_matmul_allreduce(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
```

**`tests/e2e/multicard/test_full_graph_mode.py`**

```diff
@@ -29,7 +29,7 @@ from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal
-def test_models_distributed_Qwen3_MOE_TP2_WITH_FULL_DECODE_ONLY():
+def test_qwen_moe_with_full_decode_only():
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
     prompts = [
@@ -75,7 +75,7 @@ def test_models_distributed_Qwen3_MOE_TP2_WITH_FULL_DECODE_ONLY():
 )
-def test_models_distributed_Qwen3_MOE_TP2_WITH_FULL():
+def test_qwen_moe_with_full():
     if 'HCCL_OP_EXPANSION_MODE' in os.environ:
         del os.environ['HCCL_OP_EXPANSION_MODE']
     prompts = [
```

**`tests/e2e/multicard/test_fused_moe_allgather_ep.py`**

```diff
@@ -41,7 +41,7 @@ from tests.e2e.conftest import VllmRunner
         "TASK_QUEUE_ENABLE": "1",
         "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1"
     })
-def test_generate_with_allgather():
+def test_deepseek_moe_fused_allgather_ep():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
@@ -62,7 +62,7 @@ def test_generate_with_allgather():
         "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
         "TASK_QUEUE_ENABLE": "1"
     })
-def test_generate_with_alltoall():
+def test_deepseek_moe_fused_alltoall_ep():
     example_prompts = ["Hello, my name is"]
     sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
```

**`tests/e2e/multicard/test_offline_inference_distributed.py`**

```diff
@@ -33,20 +33,15 @@ os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 QWEN_DENSE_MODELS = [
-    "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+    "vllm-ascend/Qwen3-8B-W8A8",
 ]
-QWEN_W4A8_OLD_VERSION_MODELS = [
-    "vllm-ascend/Qwen3-8B-W4A8",
-]
-QWEN_W4A8_NEW_VERSION_MODELS = [
+QWEN_W4A8_MODELS = [
     "vllm-ascend/Qwen3-1.7B-W4A8-V1",
 ]
 DEEPSEEK_W4A8_MODELS = [
-    "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
-    "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
+    "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
 ]
 KIMI_W4A16_MODELS = [
@@ -54,22 +49,6 @@ KIMI_W4A16_MODELS = [
 ]
 
-def test_models_distributed_QwQ():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with VllmRunner(
-            "Qwen/QwQ-32B",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-            enforce_eager=False,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
 def test_models_distributed_DeepSeek_multistream_moe():
     example_prompts = [
         "Hello, my name is",
@@ -89,40 +68,8 @@ def test_models_distributed_DeepSeek_multistream_moe():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
-def test_models_distributed_Qwen3_W8A8():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-8B-W8A8"),
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-@pytest.mark.parametrize("model", QWEN_W4A8_OLD_VERSION_MODELS)
-def test_models_distributed_Qwen3_W4A8DYNAMIC_old_version(model):
-    prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            snapshot_download(model),
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(prompts, max_tokens)
-
-@pytest.mark.parametrize("model", QWEN_W4A8_NEW_VERSION_MODELS)
-def test_models_distributed_Qwen3_W4A8DYNAMIC_new_version(model):
+@pytest.mark.parametrize("model", QWEN_W4A8_MODELS)
+def test_models_distributed_Qwen3_W4A8DYNAMIC(model):
     prompts = [
         "Hello, my name is",
     ]
```

**`tests/e2e/multicard/test_offline_weight_load.py`**

```diff
@@ -31,7 +31,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]
 @pytest.mark.parametrize("model", MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
-def test_offline_weight_load_and_sleepmode(model):
+def test_qwen_offline_weight_load_and_sleepmode(model):
     script = Path(
         __file__
     ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
```

**`tests/e2e/multicard/test_pipeline_parallel.py`**

```diff
@@ -1,47 +1,47 @@
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
 
 import pytest
 
 from tests.e2e.conftest import VllmRunner
 
 MODELS = [
     "Qwen/Qwen3-0.6B",
     "deepseek-ai/DeepSeek-V2-Lite-Chat",
 ]
 
 TENSOR_PARALLELS = [1]
 PIPELINE_PARALLELS = [2]
 DIST_EXECUTOR_BACKEND = ["mp", "ray"]
 
 prompts = [
     "Hello, my name is",
     "The future of AI is",
 ]
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
 @pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
-def test_models(model: str, tp_size: int, pp_size: int,
+def test_models_pp2(model: str, tp_size: int, pp_size: int,
                 distributed_executor_backend: str) -> None:
     with VllmRunner(model,
                     tensor_parallel_size=tp_size,
                     pipeline_parallel_size=pp_size,
                     distributed_executor_backend=distributed_executor_backend,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_model.generate_greedy(prompts, 64)
```