[CI] cleanup e2e test (#4800)

### What this PR does / why we need it?
This PR refactors the E2E multicard test suite to improve test case
identification and maintainability. Specifically, it renames various
test functions to be more descriptive (explicitly indicating model
families like Qwen/DeepSeek and parallelism strategies like DP/TP/PP/EP)
and cleans up outdated or redundant test configurations in the offline
distributed inference tests.

**Key Changes:**
1. Test Function Renaming (Standardization): Renamed multiple test
functions across **`tests/e2e/multicard/`** to carry clear
suffixes/prefixes identifying the model family and parallelism strategy.
This makes test cases easier to tell apart in CI logs and prevents naming
collisions.

**`test_aclgraph_capture_replay.py`:** 
- `test_aclgraph_capture_replay_dp2` ->
`test_aclgraph_capture_replay_metrics_dp2`

**`test_data_parallel.py`:**
- `test_data_parallel_inference` -> `test_qwen_inference_dp2`

**`test_data_parallel_tp2.py`:**
- `test_data_parallel_inference` -> `test_qwen_inference_dp2_tp2`

**`test_expert_parallel.py`:**
- `test_e2e_ep_correctness` -> `test_deepseek_correctness_ep`

**`test_external_launcher.py`:**
- `test_external_launcher` -> `test_qwen_external_launcher`
- `test_moe_external_launcher` -> `test_qwen_moe_external_launcher_ep`
- `test_external_launcher_and_sleepmode` ->
`test_qwen_external_launcher_with_sleepmode`
- `test_external_launcher_and_sleepmode_level2` ->
`test_qwen_external_launcher_with_sleepmode_level2`
- `test_mm_allreduce` ->
`test_qwen_external_launcher_with_matmul_allreduce`

**`test_full_graph_mode.py`:** 
- `test_models_distributed_Qwen3_MOE_TP2_WITH_FULL_DECODE_ONLY` ->
`test_qwen_moe_with_full_decode_only`
- `test_models_distributed_Qwen3_MOE_TP2_WITH_FULL` ->
`test_qwen_moe_with_full`

**`test_fused_moe_allgather_ep.py`:** 
- `test_generate_with_allgather` ->
`test_deepseek_moe_fused_allgather_ep`
- `test_generate_with_alltoall` -> `test_deepseek_moe_fused_alltoall_ep`

**`test_offline_weight_load.py`:**
- `test_offline_weight_load_and_sleepmode` ->
`test_qwen_offline_weight_load_and_sleepmode`

**`test_pipeline_parallel.py`:**
- `test_models` -> `test_models_pp2`
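The payoff of the standardized names above can be sketched with a small,
hypothetical helper that mimics pytest's `-k` substring selection; the
name list is taken from this PR, but the `select` function is purely
illustrative and not part of the test suite:

```python
# New, standardized test names introduced by this PR.
new_test_names = [
    "test_aclgraph_capture_replay_metrics_dp2",
    "test_qwen_inference_dp2",
    "test_qwen_inference_dp2_tp2",
    "test_deepseek_correctness_ep",
    "test_qwen_external_launcher",
    "test_qwen_moe_external_launcher_ep",
    "test_models_pp2",
]

def select(names, keyword):
    """Mimic the substring matching behind `pytest -k <keyword>`."""
    return [n for n in names if keyword in n]

# With model family and parallel strategy encoded in the name, one
# keyword now selects a coherent slice of the suite.
print(select(new_test_names, "dp2"))
print(select(new_test_names, "deepseek"))
```

In a real CI run the equivalent would be e.g. `pytest tests/e2e/multicard -k dp2`.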

2. Distributed Inference Cleanup
(**`test_offline_inference_distributed.py`**):

**Model list changes:**
```diff
 QWEN_DENSE_MODELS = [
-    "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+    "vllm-ascend/Qwen3-8B-W8A8",
 ]
```

```diff
-QWEN_W4A8_OLD_VERSION_MODELS = [
-    "vllm-ascend/Qwen3-8B-W4A8",
-]
-QWEN_W4A8_NEW_VERSION_MODELS = [
+QWEN_W4A8_MODELS = [
     "vllm-ascend/Qwen3-1.7B-W4A8-V1",
 ]
 DEEPSEEK_W4A8_MODELS = [
-    "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
-    "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
+    "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
 ]
```

**Test Function Changes:**
- removed `test_models_distributed_QwQ`
- removed `test_models_distributed_Qwen3_W8A8`
- removed `test_models_distributed_Qwen3_W4A8DYNAMIC_old_version`
- `test_models_distributed_Qwen3_W4A8DYNAMIC_new_version` ->
`test_models_distributed_Qwen3_W4A8DYNAMIC`

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Author: SILONG ZENG
Date: 2025-12-11 20:35:32 +08:00 (committed by GitHub)
Commit: e56dba9b0d (parent: 3349f61769)
11 changed files with 67 additions and 124 deletions


**`tests/e2e/multicard/test_offline_inference_distributed.py`:**
```diff
@@ -33,20 +33,15 @@ os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 QWEN_DENSE_MODELS = [
-    "vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
+    "vllm-ascend/Qwen3-8B-W8A8",
 ]
-QWEN_W4A8_OLD_VERSION_MODELS = [
-    "vllm-ascend/Qwen3-8B-W4A8",
-]
-QWEN_W4A8_NEW_VERSION_MODELS = [
+QWEN_W4A8_MODELS = [
     "vllm-ascend/Qwen3-1.7B-W4A8-V1",
 ]
 DEEPSEEK_W4A8_MODELS = [
-    "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
-    "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
+    "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
 ]
 KIMI_W4A16_MODELS = [
@@ -54,22 +49,6 @@ KIMI_W4A16_MODELS = [
 ]
 
-def test_models_distributed_QwQ():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with VllmRunner(
-            "Qwen/QwQ-32B",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-            enforce_eager=False,
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
 def test_models_distributed_DeepSeek_multistream_moe():
     example_prompts = [
         "Hello, my name is",
@@ -89,40 +68,8 @@ def test_models_distributed_DeepSeek_multistream_moe():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
-def test_models_distributed_Qwen3_W8A8():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-8B-W8A8"),
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-@pytest.mark.parametrize("model", QWEN_W4A8_OLD_VERSION_MODELS)
-def test_models_distributed_Qwen3_W4A8DYNAMIC_old_version(model):
-    prompts = [
-        "Hello, my name is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
-            snapshot_download(model),
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            quantization="ascend",
-    ) as vllm_model:
-        vllm_model.generate_greedy(prompts, max_tokens)
-
-@pytest.mark.parametrize("model", QWEN_W4A8_NEW_VERSION_MODELS)
-def test_models_distributed_Qwen3_W4A8DYNAMIC_new_version(model):
+@pytest.mark.parametrize("model", QWEN_W4A8_MODELS)
+def test_models_distributed_Qwen3_W4A8DYNAMIC(model):
     prompts = [
         "Hello, my name is",
     ]
```