[Lint] Style: Convert tests/ to ruff format (Batch #1) (#6738)

### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
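
The conversion is intended to be purely stylistic: argument lists, quoting, and line wrapping are rewritten to `ruff format` conventions while test behaviour stays the same. As a minimal sketch (not part of this PR) of how such a batch can be reformatted and verified locally, the snippet below assumes `ruff` is installed and picks up the repository's `pyproject.toml` configuration; the path list simply mirrors the scope table above.

```python
# Hypothetical local workflow (not part of this PR); assumes `ruff` is installed
# and configured via the repository's pyproject.toml.
import subprocess

BATCH_1_PATHS = [
    "tests/e2e/310p",
    "tests/e2e/conftest.py",
    "tests/e2e/model_utils.py",
    "tests/e2e/models",
    "tests/e2e/multicard/2-cards",
]

# Rewrite the files in place.
subprocess.run(["ruff", "format", *BATCH_1_PATHS], check=True)

# CI-style check: exits non-zero if any file is still unformatted.
subprocess.run(["ruff", "format", "--check", *BATCH_1_PATHS], check=True)
```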

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 9562912cea

Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Authored by: SILONG ZENG
Committed by: GitHub
Date: 2026-03-10 09:52:50 +08:00
Commit: 43df2cb2fc (parent: 9216e1b050)
27 changed files with 753 additions and 859 deletions

View File

@@ -18,15 +18,12 @@
from __future__ import annotations
import math
import os
import random
from typing import Any, Union
from unittest.mock import patch
import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm import SamplingParams
from vllm.config import CompilationConfig
from vllm.v1.metrics.reader import Counter, Vector
@@ -101,7 +98,8 @@ def test_eagle3_sp_acceptance(
[prompt],
tokenize=False,
add_generation_prompt=True,
) for prompt in prompts
)
for prompt in prompts
]
speculative_config = {
@@ -112,21 +110,20 @@ def test_eagle3_sp_acceptance(
"model": spec_model_name,
}
compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY",
cudagraph_capture_sizes=[12])
compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY", cudagraph_capture_sizes=[12])
with VllmRunner(
main_model_name,
enforce_eager=True,
max_model_len=8192,
disable_log_stats=False,
tensor_parallel_size=2,
max_num_seqs=256,
distributed_executor_backend="mp",
gpu_memory_utilization=0.7,
speculative_config=speculative_config,
compilation_config=compilation_config,
async_scheduling=async_scheduling,
main_model_name,
enforce_eager=True,
max_model_len=8192,
disable_log_stats=False,
tensor_parallel_size=2,
max_num_seqs=256,
distributed_executor_backend="mp",
gpu_memory_utilization=0.7,
speculative_config=speculative_config,
compilation_config=compilation_config,
async_scheduling=async_scheduling,
) as llm:
_ = llm.generate(prompts, sampling_params)
metrics = llm.model.get_metrics()
@@ -142,10 +139,7 @@ def test_eagle3_sp_acceptance(
for pos in range(len(metric.values)):
num_accepted_tokens_per_pos[pos] += metric.values[pos]
acceptance_per_pos = [
num_accepted_tokens / num_drafts
for num_accepted_tokens in num_accepted_tokens_per_pos
]
acceptance_per_pos = [num_accepted_tokens / num_drafts for num_accepted_tokens in num_accepted_tokens_per_pos]
golden = BASELINES_SP[method]
match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))

View File

@@ -25,8 +25,8 @@ import pytest
import torch
from vllm.utils.network_utils import get_open_port
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
from tests.e2e.conftest import wait_until_npu_memory_free
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
MODELS = [
# Offline data parallel mode will be not supported/useful for dense models
@@ -58,8 +58,7 @@ def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
]
for cls, method, counter in hooks:
stack.enter_context(
patch.object(cls, method, make_spy(cls, method, counter)))
stack.enter_context(patch.object(cls, method, make_spy(cls, method, counter)))
return stack
@@ -75,18 +74,19 @@ def _run_worker_process(
max_tokens: int,
):
"""Main entry point for the worker process."""
os.environ.update({
"VLLM_DP_RANK": str(rank),
"VLLM_DP_RANK_LOCAL": str(local_rank),
"VLLM_DP_SIZE": str(world_size),
"VLLM_DP_MASTER_IP": master_ip,
"VLLM_DP_MASTER_PORT": str(master_port),
})
os.environ.update(
{
"VLLM_DP_RANK": str(rank),
"VLLM_DP_RANK_LOCAL": str(local_rank),
"VLLM_DP_SIZE": str(world_size),
"VLLM_DP_MASTER_IP": master_ip,
"VLLM_DP_MASTER_PORT": str(master_port),
}
)
# Import vLLM only after environment setup
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (
destroy_distributed_environment, destroy_model_parallel)
from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
# Apply hooks and run inference
with _install_spies(counters):
@@ -100,23 +100,20 @@ def _run_worker_process(
# Simple data sharding
chunk_size = len(prompts) // world_size
start_idx = rank * chunk_size
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(
prompts)
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(prompts)
local_prompts = prompts[start_idx:end_idx]
llm = LLM(
model=model_path,
quantization="ascend" if "W8A8" in model_path else None,
enable_expert_parallel=True if "DeepSeek" in model_path else False,
enable_expert_parallel="DeepSeek" in model_path,
trust_remote_code=True,
)
# Expose model config to the main test process
counters["hidden_layers"].value = (
llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
counters["hidden_layers"].value = llm.llm_engine.model_config.hf_text_config.num_hidden_layers
llm.generate(local_prompts,
SamplingParams(max_tokens=max_tokens, temperature=0.0))
llm.generate(local_prompts, SamplingParams(max_tokens=max_tokens, temperature=0.0))
# Explicit cleanup is mandatory in multi-process vLLM tests
del llm
@@ -162,8 +159,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
for rank in range(dp_size):
p = multiprocessing.Process(
target=_run_worker_process,
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
max_tokens),
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model, max_tokens),
)
p.start()
workers.append(p)
@@ -175,8 +171,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
for k in workers:
if k.is_alive():
k.kill()
raise RuntimeError(
f"Worker {p.pid} failed with exit code {p.exitcode}")
raise RuntimeError(f"Worker {p.pid} failed with exit code {p.exitcode}")
actual_capture = counters["capture"].value
actual_replay = counters["replay"].value
@@ -185,18 +180,16 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
num_layers = counters["hidden_layers"].value
num_acl_graphs = num_layers + 1
num_comm_groups = sum(1 for s in [dp_size, 1]
if s > 1) # dp_size=2, tp_size=1
num_comm_groups = sum(1 for s in [dp_size, 1] if s > 1) # dp_size=2, tp_size=1
# Metric 1: Graph Capture (ACL Graph Construction)
# Ref: vllm_ascend.utils.update_aclgraph_sizes
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
num_acl_graphs / (1 + num_comm_groups * 2))
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) / num_acl_graphs / (1 + num_comm_groups * 2))
expected_capture = max_batch_sizes * num_acl_graphs * dp_size
assert (
actual_capture == expected_capture
), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
assert actual_capture == expected_capture, (
f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
)
# Metric 2: Model Execution (NPUModelRunner.execute_model)
# vLLM Step Breakdown:
@@ -207,9 +200,9 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
# vllm default enables Async scheduler, this will take 1 more steps
expected_exec_model = (total_steps + 1 + 1) * dp_size
assert (
num_execute_model == expected_exec_model
), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
assert num_execute_model == expected_exec_model, (
f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
)
# Metric 3: Dummy Runs (Warmup & Alignment)
# vLLM synchronizes globally every 32 steps.
@@ -228,14 +221,12 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
expected_dummy_run = (warmup_runs + padding_runs) * dp_size
assert (
num_dummy_run == expected_dummy_run
), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
assert num_dummy_run == expected_dummy_run, (
f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
)
# Metric 4: Graph Replay (Inference Execution)
# Replays happen for every aligned step across all graphs.
expected_replay = num_acl_graphs * aligned_steps * dp_size
assert (
actual_replay == expected_replay
), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"
assert actual_replay == expected_replay, f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"

View File

@@ -64,12 +64,8 @@ def test_qwen3_inference_dp2(model, max_tokens):
cmd.append("ascend")
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600)
output = proc.stdout.decode(errors='ignore')
proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=600)
output = proc.stdout.decode(errors="ignore")
print(output)

View File

@@ -27,6 +27,7 @@ MODELS = [
SHARED_STORAGE_PATH = "/dev/shm/epd/storage"
TENSOR_PARALLELS = [1]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@@ -36,36 +37,61 @@ async def test_models(model: str, tp_size: int) -> None:
vllm_server_args = [
[
"--port",
str(encode_port), "--model", model, "--gpu-memory-utilization",
"0.01", "--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--no-enable-prefix-caching",
"--max-model-len", "10000", "--max-num-batched-tokens", "10000",
"--max-num-seqs", "1", "--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}'
str(encode_port),
"--model",
model,
"--gpu-memory-utilization",
"0.01",
"--tensor-parallel-size",
str(tp_size),
"--enforce-eager",
"--no-enable-prefix-caching",
"--max-model-len",
"10000",
"--max-num-batched-tokens",
"10000",
"--max-num-seqs",
"1",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"'
+ SHARED_STORAGE_PATH
+ '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}',
],
[
"--port",
str(pd_port), "--model", model, "--gpu-memory-utilization", "0.95",
str(pd_port),
"--model",
model,
"--gpu-memory-utilization",
"0.95",
"--tensor-parallel-size",
str(tp_size), "--enforce-eager", "--max-model-len", "10000",
"--max-num-batched-tokens", "10000", "--max-num-seqs", "128",
str(tp_size),
"--enforce-eager",
"--max-model-len",
"10000",
"--max-num-batched-tokens",
"10000",
"--max-num-seqs",
"128",
"--ec-transfer-config",
'{"ec_connector_extra_config":{"shared_storage_path":"' +
SHARED_STORAGE_PATH +
'"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}'
]
'{"ec_connector_extra_config":{"shared_storage_path":"'
+ SHARED_STORAGE_PATH
+ '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}',
],
]
proxy_port = get_open_port()
proxy_args = [
"--host", "127.0.0.1", "--port",
str(proxy_port), "--encode-servers-urls",
f"http://localhost:{encode_port}", "--decode-servers-urls",
f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable"
"--host",
"127.0.0.1",
"--port",
str(proxy_port),
"--encode-servers-urls",
f"http://localhost:{encode_port}",
"--decode-servers-urls",
f"http://localhost:{pd_port}",
"--prefill-servers-urls",
"disable",
]
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _:
with DisaggEpdProxy(proxy_args=proxy_args) as proxy:
send_image_request(model, proxy)
with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _, DisaggEpdProxy(proxy_args=proxy_args) as proxy:
send_image_request(model, proxy)

View File

@@ -15,15 +15,12 @@ def test_deepseek_correctness_ep(model_name):
max_tokens = 5
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
with VllmRunner(model_name,
cudagraph_capture_sizes=[1, 2, 4, 8],
tensor_parallel_size=2) as vllm_model:
with VllmRunner(model_name, cudagraph_capture_sizes=[1, 2, 4, 8], tensor_parallel_size=2) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model_name,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
enable_expert_parallel=True) as vllm_model:
with VllmRunner(
model_name, tensor_parallel_size=2, cudagraph_capture_sizes=[1, 2, 4, 8], enable_expert_parallel=True
) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(

View File

@@ -29,6 +29,7 @@ from unittest.mock import patch
import pytest
import torch_npu
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import wait_until_npu_memory_free
MODELS = ["Qwen/Qwen3-0.6B"]
@@ -39,9 +40,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
@@ -68,7 +67,7 @@ def test_qwen3_external_launcher(model):
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
@@ -81,16 +80,24 @@ def test_qwen3_external_launcher(model):
@pytest.mark.parametrize("model", MOE_MODELS)
@wait_until_npu_memory_free()
def test_qwen3_moe_external_launcher_ep_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script), "--model", model, "--tp-size", "2", "--node-size", "1",
"--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code",
"--enable-expert-parallel"
str(script),
"--model",
model,
"--tp-size",
"2",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-expert-parallel",
]
print(f"Running subprocess: {' '.join(cmd)}")
@@ -101,7 +108,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
@@ -113,9 +120,7 @@ def test_qwen3_moe_external_launcher_ep_tp2(model):
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
@wait_until_npu_memory_free()
def test_qwen3_external_launcher_with_sleepmode():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
@@ -147,7 +152,7 @@ def test_qwen3_external_launcher_with_sleepmode():
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
@@ -158,9 +163,7 @@ def test_qwen3_external_launcher_with_sleepmode():
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode_level2():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
model_path = snapshot_download("Qwen/Qwen3-8B")
# TODO: Add moe model test
@@ -195,7 +198,7 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
@@ -210,14 +213,9 @@ def test_qwen3_external_launcher_with_sleepmode_level2():
)
@pytest.mark.parametrize("model", MODELS)
@wait_until_npu_memory_free()
@patch.dict(os.environ, {
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
"HCCL_BUFFSIZE": "500"
})
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1", "HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher_with_matmul_allreduce(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
@@ -236,7 +234,7 @@ def test_qwen3_external_launcher_with_matmul_allreduce(model):
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)
assert "Generated text:" in output

View File

@@ -26,41 +26,39 @@ from tests.e2e.model_utils import check_outputs_equal
def test_qwen3_moe_full_decode_only_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
@@ -72,41 +70,39 @@ def test_qwen3_moe_full_decode_only_tp2():
@pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
def test_qwen3_moe_full_graph_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={"cudagraph_mode": "FULL", "cudagraph_capture_sizes": [4, 8, 24, 48, 60]},
) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_fullgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,

View File

@@ -1,23 +1,22 @@
import pytest
from tests.e2e.conftest import VllmRunner
from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
MODEL_PATH, do_sample)
from tests.e2e.singlecard.test_ilama_lora import EXPECTED_LORA_OUTPUT, MODEL_PATH, do_sample
@pytest.mark.parametrize("distributed_executor_backend", ["mp"])
def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
with VllmRunner(
MODEL_PATH,
enable_lora=True,
max_loras=4,
dtype="half",
max_model_len=1024,
max_num_seqs=16,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
MODEL_PATH,
enable_lora=True,
max_loras=4,
dtype="half",
max_model_len=1024,
max_num_seqs=16,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
) as vllm_model:
output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)

View File

@@ -20,8 +20,10 @@
Run `pytest tests/test_offline_inference.py`.
"""
import os
from unittest.mock import patch
import pytest
from vllm import SamplingParams
@@ -51,6 +53,7 @@ GPT_OSS_MODELS = [
"unsloth/gpt-oss-20b-BF16",
]
def test_deepseek_multistream_moe_tp2():
example_prompts = [
"Hello, my name is",
@@ -58,15 +61,15 @@ def test_deepseek_multistream_moe_tp2():
dtype = "half"
max_tokens = 5
with VllmRunner(
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
additional_config={
"enable_multistream_moe": True,
"refresh": True,
},
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
additional_config={
"enable_multistream_moe": True,
"refresh": True,
},
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -78,12 +81,12 @@ def test_qwen3_w4a8_dynamic_tp2(model):
]
max_tokens = 5
with VllmRunner(
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)
@@ -92,20 +95,17 @@ def test_qwen3_moe_sp_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner("Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
compilation_config={"pass_config": {
"enable_sp": True
}},
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
compilation_config={"pass_config": {"enable_sp": True}},
enable_expert_parallel=True,
enforce_eager=True,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -113,33 +113,34 @@ def test_qwen3_moe_sp_tp2() -> None:
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
def test_deepseek_w4a8_accuracy_tp2(model):
prompts = [
"Hello, my name is", "The president of the United States is",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
]
vllm_ds_w4a8_answers = [
'逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
"Hello, my name is",
"The president of the United States is",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs",
]
vllm_ds_w4a8_answers = ["逍遙而至地去 accrued", "平行于我udo madreHelen", "ysteepaolis backwards Kj"]
sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
with VllmRunner(model,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
enable_expert_parallel=True) as vllm_model:
vllm_quant_outputs = vllm_model.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
enable_expert_parallel=True,
) as vllm_model:
vllm_quant_outputs = vllm_model.model.generate(prompts, sampling_params)
vllm_quant_outputs_list = []
for output in vllm_quant_outputs:
vllm_quant_outputs_list.append(
([output.outputs[0].index], output.outputs[0].text))
vllm_quant_outputs_list.append(([output.outputs[0].index], output.outputs[0].text))
vllm_answer_list = []
vllm_answer_list = ([([0], answer) for answer in vllm_ds_w4a8_answers])
vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]
check_outputs_equal(outputs_0_lst=vllm_answer_list,
outputs_1_lst=vllm_quant_outputs_list,
name_0="vllm_quant_outputs",
name_1="vllm_answer_outputs")
check_outputs_equal(
outputs_0_lst=vllm_answer_list,
outputs_1_lst=vllm_quant_outputs_list,
name_0="vllm_quant_outputs",
name_1="vllm_answer_outputs",
)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@@ -148,17 +149,16 @@ def test_qwen3_moe_fc2_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner("Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -168,20 +168,17 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=
True, # TODO(Levi-JQ): support graph mode for fc2 in Qwen
additional_config={"layer_sharding": ["o_proj"]}) as vllm_model:
"Qwen/Qwen3-30B-A3B",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True, # TODO(Levi-JQ): support graph mode for fc2 in Qwen
additional_config={"layer_sharding": ["o_proj"]},
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -190,17 +187,16 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
example_prompts = [
"test" * 1001,
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
quantization="ascend") as vllm_model:
sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9)
with VllmRunner(
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
quantization="ascend",
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@@ -213,12 +209,12 @@ def test_qwen3_dense_fc1_tp2(model):
max_tokens = 5
with VllmRunner(
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -232,13 +228,13 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
max_tokens = 5
with VllmRunner(
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
additional_config={"weight_prefetch_config": {"enabled": True}},
model,
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
additional_config={"weight_prefetch_config": {"enabled": True}},
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -252,28 +248,20 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
"Hello ",
]
# "max_position_embeddings": 163840,
long_example_prompts = [
"Hello " * (163839 - 500) + "Hello"
]
long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
max_tokens = 500
with VllmRunner("vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
tensor_parallel_size=2,
quantization="ascend",
enable_expert_parallel=True,
max_model_len=163840,
compilation_config={
"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
speculative_config={
"num_speculative_tokens": 1,
"method": "deepseek_mtp"
},
additional_config={
"layer_sharding":["q_b_proj", "o_proj"]
},
reasoning_parser="deepseek_v3",
tokenizer_mode="deepseek_v32") as vllm_model:
with VllmRunner(
"vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
tensor_parallel_size=2,
quantization="ascend",
enable_expert_parallel=True,
max_model_len=163840,
compilation_config={"cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12], "cudagraph_mode": "FULL_DECODE_ONLY"},
speculative_config={"num_speculative_tokens": 1, "method": "deepseek_mtp"},
additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
reasoning_parser="deepseek_v3",
tokenizer_mode="deepseek_v32",
) as vllm_model:
vllm_model.generate_greedy(short_example_prompts, max_tokens)
vllm_model.generate_greedy(long_example_prompts, max_tokens)
@@ -285,10 +273,10 @@ def test_qwen3_w4a4_distributed_tp2(model):
]
max_tokens = 5
with VllmRunner(
model,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
model,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -300,8 +288,8 @@ def test_gpt_oss_distributed_tp2(model):
]
max_tokens = 5
with VllmRunner(
model,
tensor_parallel_size=2,
enforce_eager=True,
model,
tensor_parallel_size=2,
enforce_eager=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -32,9 +32,7 @@ MODELS = ["Qwen/Qwen3-30B-A3B"]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_offline_load_and_sleepmode_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
script = Path(__file__).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
@@ -65,7 +63,7 @@ def test_qwen3_offline_load_and_sleepmode_tp2(model):
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
output = proc.stdout.decode(errors="ignore")
print(output)

View File

@@ -37,12 +37,13 @@ prompts = [
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
def test_models_pp2(model: str, tp_size: int, pp_size: int,
distributed_executor_backend: str) -> None:
with VllmRunner(model,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
gpu_memory_utilization=0.7) as vllm_model:
def test_models_pp2(model: str, tp_size: int, pp_size: int, distributed_executor_backend: str) -> None:
with VllmRunner(
model,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
gpu_memory_utilization=0.7,
) as vllm_model:
vllm_model.generate_greedy(prompts, 64)

View File

@@ -11,11 +11,14 @@ MODELS = [
# for MHA
"Qwen/Qwen3-8B",
# for MLA
"deepseek-ai/DeepSeek-V2-Lite-Chat"
"deepseek-ai/DeepSeek-V2-Lite-Chat",
]
# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
# ruff: noqa: E501
LONG_PROMPT = (
"You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+ """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
@@ -49,32 +52,34 @@ LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables i
| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
"""
)
INPUT_PROMPTS = [
LONG_PROMPT +
"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT +
"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
with VllmRunner(model,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
with VllmRunner(
model,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
enable_prefix_caching=False,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
with VllmRunner(
model,
enable_prefix_caching=False,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
check_outputs_equal(

View File

@@ -16,7 +16,6 @@
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
import pytest
from tests.e2e.conftest import VllmRunner
@@ -27,16 +26,16 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
]
max_tokens = 5
with VllmRunner(
"neuralmagic/Qwen2.5-3B-quantized.w8a8",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=4096,
gpu_memory_utilization=0.8,
"neuralmagic/Qwen2.5-3B-quantized.w8a8",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
golden_results = [
'The president of the United States is the head of state and',
"The president of the United States is the head of state and",
]
for i in range(len(vllm_output)):
@@ -50,36 +49,37 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
golden_results = [
'The president of the United States is the head of state and',
"The president of the United States is the head of state and",
]
for i in range(len(vllm_output)):
assert golden_results[i] == vllm_output[i][1]
print(f"Generated text: {vllm_output[i][1]!r}")
def test_qwen3_moe_w4a8_dynamic_llm_compressor():
example_prompts = [
"The president of the United States is",
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
golden_results = [
'The president of the United States is the head of state and',
"The president of the United States is the head of state and",
]
for i in range(len(vllm_output)):

View File

@@ -34,11 +34,11 @@ def test_qwen3_moe_distributed_mp_tp2_ep():
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -49,27 +49,27 @@ def test_qwen3_moe_w8a8_distributed_tp2():
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-W8A8",
max_model_len=8192,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
"vllm-ascend/Qwen3-30B-A3B-W8A8",
max_model_len=8192,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
def test_qwen3_moe_distributed_aiv_tp2():
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
os.environ["HCCL_OP_EXPANSION_MODE"] = "AIV"
example_prompts = [
"Hello, my name is",
]
dtype = "auto"
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
"Qwen/Qwen3-30B-A3B",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -80,23 +80,24 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
port = get_open_port()
compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
server_args = [
"--max_model_len", "8192", "--tensor_parallel_size", "2",
"--enable_expert_parallel", "--quantization", "ascend", "--port",
str(port), "--compilation-config", compilation_config
"--max_model_len",
"8192",
"--tensor_parallel_size",
"2",
"--enable_expert_parallel",
"--quantization",
"ascend",
"--port",
str(port),
"--compilation-config",
compilation_config,
]
env_dict = {"HCCL_BUFFSIZE": "1024"}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=400,
temperature=0,
top_p=1.0,
n=1)
batch = await client.completions.create(
model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
)
gt_choices: list[openai.types.CompletionChoice] = batch.choices
# dynamic eplb test
@@ -108,22 +109,14 @@ async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
"dynamic_eplb": True,
"expert_heat_collection_interval": 100,
"algorithm_execution_interval": 20,
"num_redundant_experts": 2
"num_redundant_experts": 2,
}
}
server_args.extend(["--additional-config", json.dumps(additional_config)])
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
with RemoteOpenAIServer(model, server_args, server_port=port, auto_port=False, env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=400,
temperature=0,
top_p=1.0,
n=1)
batch = await client.completions.create(
model=model, prompt="What is deeplearning?", max_tokens=400, temperature=0, top_p=1.0, n=1
)
eplb_choices: list[openai.types.CompletionChoice] = batch.choices
assert gt_choices[0].text == eplb_choices[
0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"
assert gt_choices[0].text == eplb_choices[0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"

View File

@@ -1,10 +1,11 @@
import os
from unittest.mock import patch
from tests.e2e.conftest import VllmRunner
from vllm import SamplingParams
from vllm.sampling_params import RequestOutputKind
from tests.e2e.conftest import VllmRunner
@patch.dict(os.environ, {"OMP_NUM_THREADS": "1"})
def test_qwen3_moe_routing_replay():
@@ -12,18 +13,15 @@ def test_qwen3_moe_routing_replay():
"Hello, please introduce yourself.",
]
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
enable_return_routed_experts=True,
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
enable_return_routed_experts=True,
) as vllm_model:
sampling_params = SamplingParams(
max_tokens=5,
temperature=0.8,
top_p=0.95,
output_kind=RequestOutputKind.FINAL_ONLY
max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY
)
inputs = vllm_model.get_inputs(prompts=prompts)
outputs = vllm_model.model.generate(prompts=inputs, sampling_params=sampling_params)

View File

@@ -84,11 +84,7 @@ async def test_models(model: str) -> None:
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
with RemoteOpenAIServer(model, server_args, server_port=port, env_dict=env_dict, auto_port=False) as server:
client = server.get_async_client()
batch = await client.completions.create(
model=model,

View File

@@ -13,69 +13,65 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS)
def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
if "HCCL_OP_EXPANSION_MODE" in os.environ:
del os.environ["HCCL_OP_EXPANSION_MODE"]
prompts = [
"Hello, my name is", "The capital of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The capital of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={
"enable_shared_expert_dp": True,
},
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_eager_outputs = runner.model.generate(
prompts, sampling_params)
shared_expert_dp_eager_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
enable_expert_parallel=True,
compilation_config={
"cudagraph_capture_sizes": [1, 4, 8, 16],
"cudagraph_mode": "FULL_DECODE_ONLY",
},
additional_config={
"enable_shared_expert_dp": True,
},
model,
max_model_len=1024,
tensor_parallel_size=2,
enable_expert_parallel=True,
compilation_config={
"cudagraph_capture_sizes": [1, 4, 8, 16],
"cudagraph_mode": "FULL_DECODE_ONLY",
},
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_aclgraph_outputs = runner.model.generate(
prompts, sampling_params)
shared_expert_dp_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_eager_outputs_list = []
for output in shared_expert_dp_eager_outputs:
shared_expert_dp_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_aclgraph_outputs_list = []
for output in shared_expert_dp_aclgraph_outputs:
shared_expert_dp_aclgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,

View File

@@ -39,8 +39,7 @@ api_keyword_args = {
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_models_single_request_aclgraph_dp2(model: str,
dp_size: int) -> None:
async def test_models_single_request_aclgraph_dp2(model: str, dp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
@@ -48,36 +47,51 @@ async def test_models_single_request_aclgraph_dp2(model: str,
}
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--no-enable-prefix-caching",
"--tensor-parallel-size",
"1",
"--data-parallel-size",
str(dp_size), "--quantization", "ascend", "--max-model-len",
"1024", "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
str(dp_size),
"--quantization",
"ascend",
"--max-model-len",
"1024",
"--port",
str(port),
"--trust-remote-code",
"--gpu-memory-utilization",
"0.9",
]
else:
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--no-enable-prefix-caching",
"--tensor-parallel-size",
"1",
"--data-parallel-size",
str(dp_size), "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
str(dp_size),
"--port",
str(port),
"--trust-remote-code",
"--gpu-memory-utilization",
"0.9",
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
vllm_serve_args=server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
with RemoteOpenAIServer(
model, vllm_serve_args=server_args, server_port=port, env_dict=env_dict, auto_port=False
) as server:
client = server.get_async_client()
try:
batch = await asyncio.wait_for(client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
),
timeout=10.0)
batch = await asyncio.wait_for(
client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
),
timeout=10.0,
)
except asyncio.TimeoutError:
pytest.fail("Model did not return response within 10 seconds")

View File

@@ -1,5 +1,3 @@
import os
import pytest
from vllm import SamplingParams
@@ -14,47 +12,46 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS)
def test_qwen3_vl_sp_tp2(model: str) -> None:
prompts = [
"Hello, my name is", "The capital of the United States is",
"The capital of France is", "The future of AI is"
"Hello, my name is",
"The capital of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=10, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": False}
},
additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}}
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": False},
},
additional_config={"ascend_compilation_config": {"enable_npugraph_ex": False}},
) as runner:
no_sp_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": True}
},
additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}}
model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_capture_sizes": [2, 4],
"cudagraph_mode": "FULL_DECODE_ONLY",
"pass_config": {"enable_sp": True},
},
additional_config={"sp_threshold": 10, "ascend_compilation_config": {"enable_npugraph_ex": False}},
) as runner:
sp_outputs = runner.model.generate(
prompts, sampling_params)
sp_outputs = runner.model.generate(prompts, sampling_params)
no_sp_outputs_list = []
for output in no_sp_outputs:
no_sp_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
no_sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
sp_outputs_list = []
for output in sp_outputs:
sp_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
sp_outputs_list.append((output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=no_sp_outputs_list,