### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
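Judging from the diff excerpt below, the changes to these files are formatting-only: yapf-style hanging indents are replaced with one-argument-per-line calls ending in a trailing comma, and short argument lists are collapsed onto a single line. A minimal before/after sketch of the second pattern, using the `SamplingParams` call that appears in the diff (the surrounding test code is omitted, so this snippet is illustrative rather than an excerpt of the patch itself):

```python
from vllm import SamplingParams

# Old layout: arguments hang-indented under the opening parenthesis.
sampling_params = SamplingParams(max_tokens=5,
                                 temperature=0.0,
                                 top_k=50,
                                 top_p=0.9)

# New layout: the short call is collapsed onto a single line.
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
```

Multi-line `VllmRunner(...)` call sites move in the same direction: one argument per line, a trailing comma, and the closing parenthesis on its own line, as the diff excerpt below shows.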
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
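As a hedged sketch (not taken from this PR), one of the reformatted suites can be exercised locally through pytest, for example:

```python
# Illustrative only: invoke one of the reformatted e2e suites via pytest's Python API.
# The file path comes from the "Scope of Changes" table above; the tp=2 cases in it
# assume at least two Ascend NPUs are available to the process.
import pytest

if __name__ == "__main__":
    exit_code = pytest.main(
        ["-s", "tests/e2e/multicard/2-cards/test_offline_inference_distributed.py"])
    raise SystemExit(exit_code)
```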
Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Diff excerpt (one of the reformatted e2e test files):

@@ -20,8 +20,10 @@
 Run `pytest tests/test_offline_inference.py`.
 """

 import os
 from unittest.mock import patch

 import pytest
 from vllm import SamplingParams
@@ -51,6 +53,7 @@ GPT_OSS_MODELS = [
     "unsloth/gpt-oss-20b-BF16",
 ]


 def test_deepseek_multistream_moe_tp2():
     example_prompts = [
         "Hello, my name is",
@@ -58,15 +61,15 @@ def test_deepseek_multistream_moe_tp2():
     dtype = "half"
     max_tokens = 5
     with VllmRunner(
-            "vllm-ascend/DeepSeek-V3-Pruning",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            distributed_executor_backend="mp",
-            additional_config={
-                "enable_multistream_moe": True,
-                "refresh": True,
-            },
+        "vllm-ascend/DeepSeek-V3-Pruning",
+        dtype=dtype,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        distributed_executor_backend="mp",
+        additional_config={
+            "enable_multistream_moe": True,
+            "refresh": True,
+        },
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -78,12 +81,12 @@ def test_qwen3_w4a8_dynamic_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            model,
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            quantization="ascend",
+        model,
+        max_model_len=8192,
+        dtype="auto",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(prompts, max_tokens)
@@ -92,20 +95,17 @@ def test_qwen3_moe_sp_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

-    with VllmRunner("Qwen/Qwen3-30B-A3B",
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    compilation_config={"pass_config": {
-                        "enable_sp": True
-                    }},
-                    enable_expert_parallel=True,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(
+        "Qwen/Qwen3-30B-A3B",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        compilation_config={"pass_config": {"enable_sp": True}},
+        enable_expert_parallel=True,
+        enforce_eager=True,
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -113,33 +113,34 @@ def test_qwen3_moe_sp_tp2() -> None:
 @patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
 def test_deepseek_w4a8_accuracy_tp2(model):
     prompts = [
-        "Hello, my name is", "The president of the United States is",
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
-    ]
-    vllm_ds_w4a8_answers = [
-        '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
-    ]
+        "Hello, my name is",
+        "The president of the United States is",
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs",
+    ]
+    vllm_ds_w4a8_answers = ["逍遙而至地去 accrued", "平行于我udo madreHelen", "ysteepaolis backwards Kj"]
     sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
-    with VllmRunner(model,
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    cudagraph_capture_sizes=[1, 2, 4, 8],
-                    quantization="ascend",
-                    enable_expert_parallel=True) as vllm_model:
-        vllm_quant_outputs = vllm_model.model.generate(prompts,
-                                                       sampling_params)
+    with VllmRunner(
+        model,
+        dtype="auto",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
+        enable_expert_parallel=True,
+    ) as vllm_model:
+        vllm_quant_outputs = vllm_model.model.generate(prompts, sampling_params)

     vllm_quant_outputs_list = []
     for output in vllm_quant_outputs:
-        vllm_quant_outputs_list.append(
-            ([output.outputs[0].index], output.outputs[0].text))
-    vllm_answer_list = []
-    vllm_answer_list = ([([0], answer) for answer in vllm_ds_w4a8_answers])
+        vllm_quant_outputs_list.append(([output.outputs[0].index], output.outputs[0].text))
+    vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]

-    check_outputs_equal(outputs_0_lst=vllm_answer_list,
-                        outputs_1_lst=vllm_quant_outputs_list,
-                        name_0="vllm_quant_outputs",
-                        name_1="vllm_answer_outputs")
+    check_outputs_equal(
+        outputs_0_lst=vllm_answer_list,
+        outputs_1_lst=vllm_quant_outputs_list,
+        name_0="vllm_quant_outputs",
+        name_1="vllm_answer_outputs",
+    )


 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@@ -148,17 +149,16 @@ def test_qwen3_moe_fc2_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

-    with VllmRunner("Qwen/Qwen3-30B-A3B",
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    enable_expert_parallel=True,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(
+        "Qwen/Qwen3-30B-A3B",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=True,
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -168,20 +168,17 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
     example_prompts = [
         "Hello, my name is",
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)

     with VllmRunner(
-            "Qwen/Qwen3-30B-A3B",
-            dtype="auto",
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-            enable_expert_parallel=True,
-            enforce_eager=
-            True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
-            additional_config={"layer_sharding": ["o_proj"]}) as vllm_model:
+        "Qwen/Qwen3-30B-A3B",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
+        additional_config={"layer_sharding": ["o_proj"]},
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -190,17 +187,16 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
     example_prompts = [
         "test" * 1001,
     ]
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
-    with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    enable_expert_parallel=True,
-                    enforce_eager=True,
-                    quantization="ascend") as vllm_model:
+    sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
+    with VllmRunner(
+        "vllm-ascend/DeepSeek-V2-Lite-W8A8",
+        dtype="auto",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        enable_expert_parallel=True,
+        enforce_eager=True,
+        quantization="ascend",
+    ) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
@@ -213,12 +209,12 @@ def test_qwen3_dense_fc1_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            model,
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            quantization="ascend",
+        model,
+        max_model_len=8192,
+        dtype="auto",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -232,13 +228,13 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            model,
-            max_model_len=8192,
-            dtype="auto",
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            quantization="ascend",
-            additional_config={"weight_prefetch_config": {"enabled": True}},
+        model,
+        max_model_len=8192,
+        dtype="auto",
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
+        additional_config={"weight_prefetch_config": {"enabled": True}},
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -252,28 +248,20 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
         "Hello ",
     ]
     # "max_position_embeddings": 163840,
-    long_example_prompts = [
-        "Hello " * (163839 - 500) + "Hello"
-    ]
+    long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
     max_tokens = 500
-    with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
-                    tensor_parallel_size=2,
-                    quantization="ascend",
-                    enable_expert_parallel=True,
-                    max_model_len=163840,
-                    compilation_config={
-                        "cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
-                        "cudagraph_mode": "FULL_DECODE_ONLY"
-                    },
-                    speculative_config={
-                        "num_speculative_tokens": 1,
-                        "method": "deepseek_mtp"
-                    },
-                    additional_config={
-                        "layer_sharding":["q_b_proj", "o_proj"]
-                    },
-                    reasoning_parser="deepseek_v3",
-                    tokenizer_mode="deepseek_v32") as vllm_model:
+    with VllmRunner(
+        "vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
+        tensor_parallel_size=2,
+        quantization="ascend",
+        enable_expert_parallel=True,
+        max_model_len=163840,
+        compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
+        speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
+        additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
+        reasoning_parser="deepseek_v3",
+        tokenizer_mode="deepseek_v32",
+    ) as vllm_model:
         vllm_model.generate_greedy(short_example_prompts, max_tokens)
         vllm_model.generate_greedy(long_example_prompts, max_tokens)
@@ -285,10 +273,10 @@ def test_qwen3_w4a4_distributed_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            model,
-            tensor_parallel_size=2,
-            cudagraph_capture_sizes=[1, 2, 4, 8],
-            quantization="ascend",
+        model,
+        tensor_parallel_size=2,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -300,8 +288,8 @@ def test_gpt_oss_distributed_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            model,
-            tensor_parallel_size=2,
-            enforce_eager=True,
+        model,
+        tensor_parallel_size=2,
+        enforce_eager=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)