diff --git a/pyproject.toml b/pyproject.toml index 7e90ef2d..665950c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,11 +46,41 @@ plugins.md024.allow_different_nesting = true # no-duplicate-headers plugins.md029.enabled = false # ol-prefix [tool.ruff] -# TODO: according to PEP8, there should be 80 characters per line +# TODO: according to PEP 8, there should be at most 79 characters per line (we currently allow 120) line-length = 120 # Folder to be modified exclude = [ - "tests/**", + # Batch (1) + "tests/e2e/__init__.py", + "tests/e2e/310p/", + "tests/e2e/conftest.py", + "tests/e2e/doctests/", + "tests/e2e/model_utils.py", + "tests/e2e/models/", + "tests/e2e/multicard/2-cards/", + + # Batch (2) + "tests/e2e/multicard/4-cards/", + "tests/e2e/nightly/multi_node/", + + # Batch (3) + "tests/e2e/nightly/single_node/models/", + + # Batch (4) + "tests/e2e/nightly/single_node/ops/", + + # Batch (5) + # "tests/e2e/singlecard/", + + # Batch (6) + "tests/e2e/nightly/single_node/ops/singlecard_ops/triton/", + "tests/e2e/singlecard/pooling/", + "tests/e2e/singlecard/spec_decode/", + "tests/e2e/utils.py", + "tests/e2e/vllm_interface/", + "tests/e2e/weekly/", + + "tests/ut/", ] [tool.ruff.lint] diff --git a/tests/e2e/singlecard/compile/backend.py b/tests/e2e/singlecard/compile/backend.py index 3776a252..e0fde30c 100644 --- a/tests/e2e/singlecard/compile/backend.py +++ b/tests/e2e/singlecard/compile/backend.py @@ -14,8 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from collections.abc import Callable, Sequence from copy import deepcopy -from typing import Any, Callable, List, Optional, Sequence +from typing import Any import torch.fx as fx from torch._inductor.decomposition import select_decomp_table @@ -37,7 +38,7 @@ class TestBackend: records the FX graph before and after the transformation. """ - def __init__(self, custom_passes: Optional[List[Any]] = None): + def __init__(self, custom_passes: list[Any] | None = None): vllm_config = get_current_vllm_config() compile_config = vllm_config.compilation_config self.inductor_config = compile_config.inductor_compile_config @@ -48,9 +49,7 @@ class TestBackend: self.graph_pre_pass = None self.graph_post_pass = None - def post_pass(self, - graph: fx.Graph, - runtime_shape: int | None = None) -> fx.Graph: + def post_pass(self, graph: fx.Graph, runtime_shape: int | None = None) -> fx.Graph: """ Apply custom graph transformation passes. """ @@ -62,13 +61,13 @@ class TestBackend: return graph def compile( - self, - graph: fx.GraphModule, - example_inputs: list[Any], - compiler_config: dict[str, Any], - runtime_shape: Optional[int] = None, - key: Optional[str] = None - ) -> tuple[Optional[Callable], Optional[Any]]: + self, + graph: fx.GraphModule, + example_inputs: list[Any], + compiler_config: dict[str, Any], + runtime_shape: int | None = None, + key: str | None = None, + ) -> tuple[Callable | None, Any | None]: """ Compile the FX graph using vLLM's Ascend compiler interface. Wraps the post-pass logic into the inner_compile callback. @@ -87,8 +86,7 @@ class TestBackend: ) return compiled_fn, None - def __call__(self, gm: fx.GraphModule, - example_inputs: Optional[List[Any]]): + def __call__(self, gm: fx.GraphModule, example_inputs: list[Any] | None): """ Make the backend callable by torch.compile(). Returns a compiled executable function. 
@@ -103,17 +101,11 @@ class TestBackend: ) return compiled_fn - def find_nodes_by_target(self, graph: fx.GraphModule, - target: OpOverload) -> List[fx.Node]: + def find_nodes_by_target(self, graph: fx.GraphModule, target: OpOverload) -> list[fx.Node]: """Helper to find all FX nodes that call a specific operator.""" - return [ - node for node in graph.graph.nodes - if hasattr(node, 'target') and node.target == target - ] + return [node for node in graph.graph.nodes if hasattr(node, "target") and node.target == target] - def check_before_ops(self, - ops: Sequence[OpOverload], - fully_replaced: bool = True): + def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced: bool = True): """ Verify that the original (unfused) operators exist before the pass and are fully removed afterward (if fully_replaced=True). diff --git a/tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py b/tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py index 2b231a4d..1fa40e36 100644 --- a/tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py +++ b/tests/e2e/singlecard/compile/test_graphex_norm_quant_fusion.py @@ -215,6 +215,7 @@ def register_pattern_safe(pattern_class, vllm_config, eps, pattern_key): try: # Import the required pass class from torch._inductor.pattern_matcher import PatternMatcherPass + pm_pass = PatternMatcherPass() pattern.register(pm_pass) _registered_patterns.add(pattern_key) @@ -243,7 +244,7 @@ def test_rmsnorm_quant_fusion( sp_enable: bool, ): # Check if fusion operator is available - if not hasattr(torch.ops.npu, 'npu_add_rms_norm_quant'): + if not hasattr(torch.ops.npu, "npu_add_rms_norm_quant"): pytest.skip("Fusion operator npu_add_rms_norm_quant not available, skipping test") vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype)) @@ -266,7 +267,7 @@ def test_rmsnorm_quant_fusion( if not enable_custom_op(): pytest.skip("Custom ops not available, skipping bias test") # Check if the bias operator exists - if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'): + if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"): pytest.skip("Operator npu_add_rms_norm_bias not available, skipping bias test") if sp_enable: model = ModelSPWithBias(hidden_size, dtype, eps, device="npu") @@ -281,13 +282,11 @@ def test_rmsnorm_quant_fusion( else: # The non-bias patterns currently use npu_add_rms_norm_bias in their pattern matching # so we need to skip if it's not available - if not hasattr(torch.ops._C_ascend, 'npu_add_rms_norm_bias'): + if not hasattr(torch.ops._C_ascend, "npu_add_rms_norm_bias"): pytest.skip("Operator npu_add_rms_norm_bias not available, skipping test") if sp_enable: model = ModelSPWithoutBias(hidden_size, dtype, eps, device="npu") - register_pattern_safe( - AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern" - ) + register_pattern_safe(AddRMSNormQuantSPPattern, vllm_config, eps, "GraphEXAddRMSNormQuantSPPattern") else: model = ModelWithoutBias(hidden_size, dtype, eps, device="npu") register_pattern_safe(AddRMSNormQuantPattern, vllm_config, eps, "GraphEXAddRMSNormQuantPattern") @@ -302,5 +301,9 @@ def test_rmsnorm_quant_fusion( compiled_out, compiled_res = compiled_model(x) # Verify output shapes are correct - assert compiled_out.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}" - assert compiled_res.shape == (num_tokens, hidden_size), f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}" + assert compiled_out.shape == (num_tokens, 
hidden_size), ( + f"Expected shape {(num_tokens, hidden_size)}, got {compiled_out.shape}" + ) + assert compiled_res.shape == (num_tokens, hidden_size), ( + f"Expected shape {(num_tokens, hidden_size)}, got {compiled_res.shape}" + ) diff --git a/tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py b/tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py index 7bd36880..7298ecb9 100644 --- a/tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py +++ b/tests/e2e/singlecard/compile/test_graphex_qknorm_rope_fusion.py @@ -201,6 +201,7 @@ def test_rmsnorm_quant_fusion( vllm_config=vllm_config, head_dim=head_dim, num_heads=num_heads, num_kv_heads=num_kv_heads, eps=eps ) from torch._inductor.pattern_matcher import PatternMatcherPass + pm_pass = PatternMatcherPass() fusion_pattern.register(pm_pass) model = model.to("npu") diff --git a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py index b272c64f..00b2b123 100644 --- a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py +++ b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py @@ -14,25 +14,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from typing import List import pytest import torch import torch.nn as nn -import torch_npu import vllm.config from vllm.config import ModelConfig, VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) +from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment from vllm.utils.system_utils import update_environment_variables import vllm_ascend.ops.register_custom_ops # noqa from tests.e2e.singlecard.compile.backend import TestBackend from vllm_ascend.ascend_forward_context import set_ascend_forward_context -from vllm_ascend.compilation.passes.norm_quant_fusion_pass import \ - AddRMSNormQuantFusionPass -from vllm_ascend.utils import enable_custom_op -from vllm_ascend.utils import vllm_version_is +from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass +from vllm_ascend.utils import enable_custom_op, vllm_version_is if vllm_version_is("0.15.0"): from vllm.compilation.fx_utils import OpOverload # type: ignore @@ -48,34 +43,24 @@ def get_or_create_backend(vllm_config): """Get or create backend with fusion passes (cached to avoid duplicate pattern registration).""" global _backend_cache if _backend_cache is None: - _backend_cache = TestBackend(custom_passes=[ - AddRMSNormQuantFusionPass(vllm_config=vllm_config) - ]) + _backend_cache = TestBackend(custom_passes=[AddRMSNormQuantFusionPass(vllm_config=vllm_config)]) return _backend_cache + class TestModelWithoutBias(nn.Module): """ A minimal test model that simulates the pattern: AddRMSNorm → Quantization (without bias) """ - def __init__(self, - hidden_size: int, - dtype: torch.dtype, - eps: float = 1e-6, - device="npu"): + def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"): super().__init__() self.hidden_size = hidden_size self.eps = eps - self.rms_norm_weight = nn.Parameter( - torch.randn(hidden_size, device=device)) + self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device)) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) - self.quant_scale_reciprocal = torch.ones(hidden_size, - dtype=dtype, - device=device) - self.quant_offset = torch.zeros(hidden_size, - dtype=dtype, - device=device) + 
self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device) + self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device) def forward(self, x): """ @@ -87,23 +72,20 @@ class TestModelWithoutBias(nn.Module): residual = torch.zeros_like(x) norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( - x, residual, self.rms_norm_weight, None, self.eps) + x, residual, self.rms_norm_weight, None, self.eps + ) - quantized_output = torch.ops.vllm.quantize(norm_output, - self.quant_scale, - self.quant_scale_reciprocal, - self.quant_offset) + quantized_output = torch.ops.vllm.quantize( + norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset + ) return quantized_output, new_residual - def ops_in_model_before(self) -> List[OpOverload]: + def ops_in_model_before(self) -> list[OpOverload]: """Return the list of expected operators BEFORE fusion.""" - return [ - torch.ops._C_ascend.npu_add_rms_norm_bias.default, - torch.ops.vllm.quantize.default - ] + return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default] - def ops_in_model_after(self) -> List[OpOverload]: + def ops_in_model_after(self) -> list[OpOverload]: """Return the list of expected operators AFTER successful fusion.""" return [torch.ops.npu.npu_add_rms_norm_quant.default] @@ -114,24 +96,15 @@ class TestModelWithBias(nn.Module): AddRMSNorm → Add Bias → Quantization (with bias) """ - def __init__(self, - hidden_size: int, - dtype: torch.dtype, - eps: float = 1e-6, - device="npu"): + def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"): super().__init__() self.hidden_size = hidden_size self.eps = eps - self.rms_norm_weight = nn.Parameter( - torch.randn(hidden_size, device=device)) + self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device)) self.bias = nn.Parameter(torch.randn(hidden_size, device=device)) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) - self.quant_scale_reciprocal = torch.ones(hidden_size, - dtype=dtype, - device=device) - self.quant_offset = torch.zeros(hidden_size, - dtype=dtype, - device=device) + self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device) + self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device) def forward(self, x): """ @@ -144,23 +117,20 @@ class TestModelWithBias(nn.Module): residual = torch.zeros_like(x) norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( - x, residual, self.rms_norm_weight, self.bias, self.eps) + x, residual, self.rms_norm_weight, self.bias, self.eps + ) - quantized_output = torch.ops.vllm.quantize(norm_output_with_bias, - self.quant_scale, - self.quant_scale_reciprocal, - self.quant_offset) + quantized_output = torch.ops.vllm.quantize( + norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset + ) return quantized_output, new_residual - def ops_in_model_before(self) -> List[OpOverload]: + def ops_in_model_before(self) -> list[OpOverload]: """Return the list of expected operators BEFORE fusion.""" - return [ - torch.ops._C_ascend.npu_add_rms_norm_bias.default, - torch.ops.vllm.quantize.default - ] + return [torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.quantize.default] - def ops_in_model_after(self) -> List[OpOverload]: + def ops_in_model_after(self) -> list[OpOverload]: """Return the list of expected operators AFTER successful fusion.""" return 
[torch.ops.npu.npu_add_rms_norm_quant.default] @@ -171,23 +141,14 @@ class TestModelSPWithoutBias(nn.Module): AddRMSNorm → maybe_allgather → Quantization (without bias) """ - def __init__(self, - hidden_size: int, - dtype: torch.dtype, - eps: float = 1e-6, - device="npu"): + def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"): super().__init__() self.hidden_size = hidden_size self.eps = eps - self.rms_norm_weight = nn.Parameter( - torch.randn(hidden_size, device=device)) + self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device)) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) - self.quant_scale_reciprocal = torch.ones(hidden_size, - dtype=dtype, - device=device) - self.quant_offset = torch.zeros(hidden_size, - dtype=dtype, - device=device) + self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device) + self.quant_offset = torch.zeros(hidden_size, dtype=dtype, device=device) def forward(self, x): """ @@ -200,32 +161,28 @@ class TestModelSPWithoutBias(nn.Module): residual = torch.zeros_like(x) norm_output, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( - x, residual, self.rms_norm_weight, None, self.eps) + x, residual, self.rms_norm_weight, None, self.eps + ) - norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad( - norm_output, True) + norm_output = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output, True) - quantized_output = torch.ops.vllm.quantize(norm_output, - self.quant_scale, - self.quant_scale_reciprocal, - self.quant_offset) + quantized_output = torch.ops.vllm.quantize( + norm_output, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset + ) return quantized_output, new_residual - def ops_in_model_before(self) -> List[OpOverload]: + def ops_in_model_before(self) -> list[OpOverload]: """Return the list of expected operators BEFORE fusion.""" return [ torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default, - torch.ops.vllm.quantize.default + torch.ops.vllm.quantize.default, ] - def ops_in_model_after(self) -> List[OpOverload]: + def ops_in_model_after(self) -> list[OpOverload]: """Return the list of expected operators AFTER successful fusion.""" - return [ - torch.ops.npu.npu_add_rms_norm_quant.default, - torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default - ] + return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default] class TestModelSPWithBias(nn.Module): @@ -234,24 +191,15 @@ class TestModelSPWithBias(nn.Module): AddRMSNorm → Add bias → maybe_allgather → Quantization (without bias) """ - def __init__(self, - hidden_size: int, - dtype: torch.dtype, - eps: float = 1e-6, - device="npu"): + def __init__(self, hidden_size: int, dtype: torch.dtype, eps: float = 1e-6, device="npu"): super().__init__() self.hidden_size = hidden_size self.eps = eps - self.rms_norm_weight = nn.Parameter( - torch.randn(hidden_size, device=device)) + self.rms_norm_weight = nn.Parameter(torch.randn(hidden_size, device=device)) self.bias = nn.Parameter(torch.randn(hidden_size, device=device)) self.quant_scale = torch.ones(hidden_size, dtype=dtype, device=device) - self.quant_scale_reciprocal = torch.ones(hidden_size, - dtype=dtype, - device=device) - self.quant_offset = torch.zeros(hidden_size, - dtype=dtype, - device=device) + self.quant_scale_reciprocal = torch.ones(hidden_size, dtype=dtype, device=device) + self.quant_offset = torch.zeros(hidden_size, 
dtype=dtype, device=device) def forward(self, x): """ @@ -265,32 +213,28 @@ class TestModelSPWithBias(nn.Module): residual = torch.zeros_like(x) norm_output_with_bias, _, new_residual = torch.ops._C_ascend.npu_add_rms_norm_bias( - x, residual, self.rms_norm_weight, self.bias, self.eps) + x, residual, self.rms_norm_weight, self.bias, self.eps + ) - norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad( - norm_output_with_bias, True) + norm_output_with_bias = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(norm_output_with_bias, True) - quantized_output = torch.ops.vllm.quantize(norm_output_with_bias, - self.quant_scale, - self.quant_scale_reciprocal, - self.quant_offset) + quantized_output = torch.ops.vllm.quantize( + norm_output_with_bias, self.quant_scale, self.quant_scale_reciprocal, self.quant_offset + ) return quantized_output, new_residual - def ops_in_model_before(self) -> List[OpOverload]: + def ops_in_model_before(self) -> list[OpOverload]: """Return the list of expected operators BEFORE fusion.""" return [ torch.ops._C_ascend.npu_add_rms_norm_bias.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default, - torch.ops.vllm.quantize.default + torch.ops.vllm.quantize.default, ] - def ops_in_model_after(self) -> List[OpOverload]: + def ops_in_model_after(self) -> list[OpOverload]: """Return the list of expected operators AFTER successful fusion.""" - return [ - torch.ops.npu.npu_add_rms_norm_quant.default, - torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default - ] + return [torch.ops.npu.npu_add_rms_norm_quant.default, torch.ops.vllm.maybe_all_gather_and_maybe_unpad.default] @pytest.mark.parametrize("dtype", [torch.bfloat16]) @@ -317,58 +261,42 @@ def test_rmsnorm_quant_fusion( vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype)) with vllm.config.set_current_vllm_config(vllm_config): - update_environment_variables({ - "RANK": "0", - "LOCAL_RANK": "0", - "WORLD_SIZE": "1", - "MASTER_ADDR": "localhost", - "MASTER_PORT": "12345", - }) + update_environment_variables( + { + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12345", + } + ) init_distributed_environment() ensure_model_parallel_initialized(1, 1) - with vllm.config.set_current_vllm_config(vllm_config): - with set_ascend_forward_context(None, vllm_config): - backend = get_or_create_backend(vllm_config) - if use_bias: - if not enable_custom_op(): - return - if sp_enable: - model = TestModelSPWithBias(hidden_size, - dtype, - eps, - device="npu") - else: - model = TestModelWithBias(hidden_size, - dtype, - eps, - device="npu") + with vllm.config.set_current_vllm_config(vllm_config), set_ascend_forward_context(None, vllm_config): + backend = get_or_create_backend(vllm_config) + if use_bias: + if not enable_custom_op(): + return + if sp_enable: + model = TestModelSPWithBias(hidden_size, dtype, eps, device="npu") else: - if sp_enable: - model = TestModelSPWithoutBias(hidden_size, - dtype, - eps, - device="npu") - else: - model = TestModelWithoutBias(hidden_size, - dtype, - eps, - device="npu") - model = model.to("npu") + model = TestModelWithBias(hidden_size, dtype, eps, device="npu") + else: + if sp_enable: + model = TestModelSPWithoutBias(hidden_size, dtype, eps, device="npu") + else: + model = TestModelWithoutBias(hidden_size, dtype, eps, device="npu") + model = model.to("npu") - x = torch.rand(num_tokens, - hidden_size, - device="npu", - dtype=dtype, - requires_grad=False) + x = torch.rand(num_tokens, hidden_size, device="npu", dtype=dtype, 
requires_grad=False) - result_unfused = model(x) - print("Unfused result:", [t.shape for t in result_unfused]) - model_fused = torch.compile(model, backend=backend) - result_fused = model_fused(x) - print("Fused result:", [t.shape for t in result_fused]) + result_unfused = model(x) + print("Unfused result:", [t.shape for t in result_unfused]) + model_fused = torch.compile(model, backend=backend) + result_fused = model_fused(x) + print("Fused result:", [t.shape for t in result_fused]) - print("=== Checking operator fusion ===") - backend.check_before_ops(model.ops_in_model_before(), - fully_replaced=not sp_enable) - backend.check_after_ops(model.ops_in_model_after()) + print("=== Checking operator fusion ===") + backend.check_before_ops(model.ops_in_model_before(), fully_replaced=not sp_enable) + backend.check_after_ops(model.ops_in_model_after()) diff --git a/tests/e2e/singlecard/model_runner_v2/test_basic.py b/tests/e2e/singlecard/model_runner_v2/test_basic.py index 672cd274..dc019a8b 100644 --- a/tests/e2e/singlecard/model_runner_v2/test_basic.py +++ b/tests/e2e/singlecard/model_runner_v2/test_basic.py @@ -47,9 +47,9 @@ def test_qwen3_dense_eager_mode( sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) with VllmRunner( - model, - max_model_len=1024, - enforce_eager=enforce_eager, + model, + max_model_len=1024, + enforce_eager=enforce_eager, ) as runner: runner.model.generate(prompts, sampling_params) @@ -74,14 +74,14 @@ def test_egale_spec_decoding( sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) with VllmRunner( - model, - max_model_len=1024, - enforce_eager=enforce_eager, - async_scheduling=True, - speculative_config={ - "model": eagle_model, - "method": "eagle", - "num_speculative_tokens": 3, - }, + model, + max_model_len=1024, + enforce_eager=enforce_eager, + async_scheduling=True, + speculative_config={ + "model": eagle_model, + "method": "eagle", + "num_speculative_tokens": 3, + }, ) as runner: runner.model.generate(prompts, sampling_params) diff --git a/tests/e2e/singlecard/test_aclgraph_accuracy.py b/tests/e2e/singlecard/test_aclgraph_accuracy.py index ac5c0de8..e031e93f 100644 --- a/tests/e2e/singlecard/test_aclgraph_accuracy.py +++ b/tests/e2e/singlecard/test_aclgraph_accuracy.py @@ -15,20 +15,22 @@ # limitations under the License. # -import pytest +# ruff: noqa: E501 + import os -from tests.e2e.singlecard.utils import (PROMPTS_LONG, PROMPTS_SHORT, - LLMTestCase, gen_and_valid) +import pytest + +from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid CASE_QWEN_ACLGRAPH = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_SHORT, golden_answers=[ " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", - ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president', - ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of', - ' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and' + " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president", + " Paris. 
The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of", + " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and", ], ) @@ -37,10 +39,10 @@ CASE_DS_ACLGRAPH = LLMTestCase( quantization="ascend", prompts=PROMPTS_SHORT, golden_answers=[ - '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2', - ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the', - ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art', - ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of' + "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2", + " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the", + " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art", + " here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of", ], ) @@ -49,9 +51,9 @@ CASE_QWEN_FULL = LLMTestCase( prompts=PROMPTS_SHORT, golden_answers=[ " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", - ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president', - ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of', - ' not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and' + " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president", + " Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of", + " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and", ], ) @@ -60,10 +62,10 @@ CASE_DS_FULL = LLMTestCase( quantization="ascend", prompts=PROMPTS_SHORT, golden_answers=[ - '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2', - ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the', - ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art', - ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of' + "\nI am a 20 year old female, and I have been suffering from depression for 3 years now. 
I have been on medication for 2", + " a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the", + " Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art", + " here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of", ], ) @@ -71,10 +73,11 @@ CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_LONG, golden_answers=[ - ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the', + " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the", " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over", - ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can' - ]) + " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can", + ], +) CASE_DS_FULL_DECODE_ONLY = LLMTestCase( model="vllm-ascend/DeepSeek-V2-Lite-W8A8", @@ -83,26 +86,31 @@ CASE_DS_FULL_DECODE_ONLY = LLMTestCase( golden_answers=[ "\n\nSelect an assignment template", "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use", - "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations" - ]) + "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations", + ], +) CASE_QWEN_EX = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=PROMPTS_LONG, golden_answers=[ - ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the', + " \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the", " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle with vertices on a square can be calculated by integrating over", - ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can' - ]) + " \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can", + ], +) + +CASE_DS_EX = LLMTestCase( + model="vllm-ascend/DeepSeek-V2-Lite-W8A8", + quantization="ascend", + prompts=PROMPTS_LONG, + golden_answers=[ + "\n\nSelect an assignment template", + "\n\nI'm not sure how to approach this problem. I'm not sure if I should use the law of total probability or if I should use", + "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations", + ], +) -CASE_DS_EX = LLMTestCase(model="vllm-ascend/DeepSeek-V2-Lite-W8A8", - quantization="ascend", - prompts=PROMPTS_LONG, - golden_answers=[ - "\n\nSelect an assignment template", - "\n\nI'm not sure how to approach this problem. 
I'm not sure if I should use the law of total probability or if I should use", - "\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x$ be the common root of the equations" - ]) @pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH]) def test_piecewise_res_consistency(cur_case: LLMTestCase): @@ -112,51 +120,48 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase): "cudagraph_capture_sizes": [1, 2, 4, 8], "quantization": cur_case.quantization, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) -@pytest.mark.parametrize( - "cur_case", [CASE_QWEN_FULL, CASE_DS_FULL]) + +@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL]) def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch): monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) runner_kwargs = { "model_name": cur_case.model, "max_model_len": 1024, - "compilation_config": { - "cudagraph_capture_sizes": [4, 8, 32, 64], - "cudagraph_mode": "FULL_DECODE_ONLY" - }, + "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"}, "quantization": cur_case.quantization, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) -@pytest.mark.parametrize( - "cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY]) + +@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY]) def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch): monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False) runner_kwargs = { "model_name": cur_case.model, "max_model_len": 1024, - "compilation_config": { - "cudagraph_capture_sizes": [4, 8, 32, 64], - "cudagraph_mode": "FULL_DECODE_ONLY" - }, + "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"}, "quantization": cur_case.quantization, - "additional_config": { - "npugraph_ex_config": { - "enable": False - } - }, + "additional_config": {"npugraph_ex_config": {"enable": False}}, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) + @pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX]) def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch): @@ -165,20 +170,16 @@ def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch): "model_name": cur_case.model, "quantization": cur_case.quantization, "max_model_len": 1024, - "compilation_config": { - "cudagraph_capture_sizes": [4, 8, 32, 64], - "cudagraph_mode": "FULL_DECODE_ONLY" - }, - "additional_config": { - "npugraph_ex_config": { - "enable": True - } - }, + "compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"}, + "additional_config": {"npugraph_ex_config": {"enable": 
True}}, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) + # The accuracy has already been verified in the previous test case. # This test case is used to check whether the functionality works properly @@ -190,10 +191,7 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch): "model_name": cur_case.model, "quantization": cur_case.quantization, "max_model_len": 1024, - "compilation_config": { - "cudagraph_capture_sizes": [4, 8], - "cudagraph_mode": "FULL_DECODE_ONLY" - }, + "compilation_config": {"cudagraph_capture_sizes": [4, 8], "cudagraph_mode": "FULL_DECODE_ONLY"}, "additional_config": { "npugraph_ex_config": { "enable": True, @@ -201,12 +199,14 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch): } }, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) # Check whether the static kernel is properly uninstall ascend_home_path = os.environ["ASCEND_HOME_PATH"] - static_kernel_install_path = os.path.join(ascend_home_path, 'opp/static_kernel/ai_core') + static_kernel_install_path = os.path.join(ascend_home_path, "opp/static_kernel/ai_core") assert not os.path.exists(static_kernel_install_path) diff --git a/tests/e2e/singlecard/test_aclgraph_batch_invariant.py b/tests/e2e/singlecard/test_aclgraph_batch_invariant.py index 048400c8..47413ba4 100644 --- a/tests/e2e/singlecard/test_aclgraph_batch_invariant.py +++ b/tests/e2e/singlecard/test_aclgraph_batch_invariant.py @@ -22,6 +22,7 @@ import random import pytest import torch from vllm import SamplingParams + from tests.e2e.conftest import VllmRunner DEFAULT_MODEL = "Qwen/Qwen3-0.6B" @@ -69,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: if target_words > 50: # For longer prompts, repeat context - padding_text = ( - " This is an interesting topic that deserves more explanation. " * - (target_words // 50)) + padding_text = " This is an interesting topic that deserves more explanation. 
" * (target_words // 50) base_prompt = base_prompt + padding_text return base_prompt @@ -107,8 +106,7 @@ def _extract_step_logprobs(generate_output): @pytest.mark.timeout(1000) -def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle( - monkeypatch: pytest.MonkeyPatch): +def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch): """ Ensures that the same request (the 'needle' prompt) yields identical output whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64), @@ -162,20 +160,16 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle( needle_prompt = "There once was a " with VllmRunner( - model_name=model, - max_num_seqs=max_batch_size, - gpu_memory_utilization=gpu_mem_util, - max_model_len=max_model_len, - dtype="bfloat16", - tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), - enable_prefix_caching=False, - distributed_executor_backend="mp", - compilation_config={ - "cudagraph_mode": "FULL_DECODE_ONLY", - "cudagraph_capture_sizes": [1, 32, 64] - } + model_name=model, + max_num_seqs=max_batch_size, + gpu_memory_utilization=gpu_mem_util, + max_model_len=max_model_len, + dtype="bfloat16", + tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), + enable_prefix_caching=False, + distributed_executor_backend="mp", + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]}, ) as vllm_model: - # Baseline generation for the needle prompt alone. baseline_out = vllm_model.generate([needle_prompt], sampling) assert len(baseline_out) == 1 @@ -194,8 +188,7 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle( if i == needle_pos: prompts.append(needle_prompt) else: - prompts.append( - _random_prompt(min_random_prompt, max_random_prompt)) + prompts.append(_random_prompt(min_random_prompt, max_random_prompt)) # Generate with the larger-batch engine outputs = vllm_model.generate(prompts, sampling) @@ -204,24 +197,23 @@ def test_aclgraph_v1_generation_is_deterministic_across_batch_sizes_with_needle( text = needle_output[0] if text != baseline_text: - print( - f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n") + print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n") mismatches += 1 passes = num_trials - mismatches # Dump how many passed vs failed - print(f"[determinism] total={num_trials}, passed={passes}, " - f"failed={mismatches}, max_batch_size={max_batch_size}") + print( + f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}" + ) if mismatches > 0: pytest.fail( f"Nondeterministic outputs detected: {mismatches} failed out " - f"of {num_trials} trials (max_batch_size={max_batch_size}).") + f"of {num_trials} trials (max_batch_size={max_batch_size})." 
+ ) - -def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( - monkeypatch: pytest.MonkeyPatch): +def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch): seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = DEFAULT_MODEL @@ -235,24 +227,19 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( if disable_custom_ar: print(f"\n{'=' * 80}") - print( - f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})" - ) + print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})") print(f"{'=' * 80}\n") with VllmRunner( - model_name=model_name, - tensor_parallel_size=tp_size, - enable_prefix_caching=False, - max_num_seqs=32, - max_model_len=8192, - dtype="bfloat16", - gpu_memory_utilization=0.9, - distributed_executor_backend="mp", - compilation_config={ - "cudagraph_mode": "FULL_DECODE_ONLY", - "cudagraph_capture_sizes": [1, 32, 64] - } + model_name=model_name, + tensor_parallel_size=tp_size, + enable_prefix_caching=False, + max_num_seqs=32, + max_model_len=8192, + dtype="bfloat16", + gpu_memory_utilization=0.9, + distributed_executor_backend="mp", + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]}, ) as vllm_model: # Use more realistic prompts for better token generation prompts = [_random_prompt(10, 50) for i in range(32)] @@ -273,16 +260,13 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( bs1_logprobs_per_prompt = [] bs1_tokens_per_prompt = [] for idx, p in enumerate(prompts): - print( - f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..." - ) + print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...") outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False) assert len(outs) == 1 # print(outs) step_logprobs, token_ids = _extract_step_logprobs(outs[0]) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bs1_logprobs_per_prompt.append(step_logprobs) bs1_tokens_per_prompt.append(token_ids) print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") @@ -304,108 +288,91 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") step_logprobs, token_ids = _extract_step_logprobs(o) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bsN_logprobs_per_prompt.append(step_logprobs) bsN_tokens_per_prompt.append(token_ids) # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. 
failed_prompts = [] for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( - zip( - bs1_logprobs_per_prompt, - bsN_logprobs_per_prompt, - bs1_tokens_per_prompt, - bsN_tokens_per_prompt, - )): + zip( + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + ) + ): if len(logprobs_bs1) != len(logprobs_bsN): - reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " - f"vs {len(logprobs_bsN)} (BS=N)") - failed_prompts.append({ - "prompt_idx": i, - "step": "all", - "reason": reason, - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)" + failed_prompts.append( + { + "prompt_idx": i, + "step": "all", + "reason": reason, + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue # Check if tokens match first if tokens_bs1 != tokens_bsN: - failed_prompts.append({ - "prompt_idx": - i, - "step": - "sampling", - "reason": - "Different tokens sampled", - "prompt_preview": - prompts[i][:100], - "bs1_tokens": - tokens_bs1, - "bsN_tokens": - tokens_bsN, - "bs1_all_logprobs": - [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], - "bsN_all_logprobs": - [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": "sampling", + "reason": "Different tokens sampled", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], + "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], + } + ) continue for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): if a.shape != b.shape: - failed_prompts.append({ - "prompt_idx": i, - "step": t, - "reason": f"Shape mismatch: {a.shape} vs {b.shape}", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Shape mismatch: {a.shape} vs {b.shape}", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break if not torch.equal(a, b): max_diff = torch.abs(a - b).max().item() # Print which token failed - print( - f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}" - ) + print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}") bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" BS=1 logprob: {a.tolist()}") print(f" BS=N logprob: {b.tolist()}") - failed_prompts.append({ - "prompt_idx": - i, - "step": - t, - "reason": - f"Bitwise mismatch (max_diff={max_diff:.6e})", - "prompt_preview": - prompts[i][:100], - "bs1_tokens": - tokens_bs1, - "bsN_tokens": - tokens_bsN, - "bs1_all_logprobs": [ - logprobs_bs1[s].tolist() - for s in range(len(logprobs_bs1)) - ], - "bsN_all_logprobs": [ - logprobs_bsN[s].tolist() - for s in range(len(logprobs_bsN)) - ], - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], + "bsN_all_logprobs": 
[logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], + } + ) break - # Print summary of all failures if failed_prompts: print(f"\n{'=' * 80}") - fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/" - f"{len(prompts)} prompts failed") + fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed" print(fail_msg) print(f"{'=' * 80}") for fail in failed_prompts: @@ -420,21 +387,18 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN( print(f" BS=N tokens: {fail['bsN_tokens']}") if "bs1_all_logprobs" in fail: - print( - f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:" - ) + print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:") for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]): print(f" Step {step_idx}: {logprobs}") - print( - f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:" - ) + print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:") for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]): print(f" Step {step_idx}: {logprobs}") print(f"{'=' * 80}\n") # Fail the test with summary - msg = (f"Batch invariance violated in {len(failed_prompts)}/" - f"{len(prompts)} prompts. See output above for details.") + msg = ( + f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details." + ) pytest.fail(msg) @@ -446,18 +410,15 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch): model = DEFAULT_MODEL with VllmRunner( - model_name=model, - max_num_seqs=1, - tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), - gpu_memory_utilization=0.9, - max_model_len=2048, - dtype="float16", - enable_prefix_caching=False, - compilation_config={ - "cudagraph_mode": "FULL_DECODE_ONLY", - "cudagraph_capture_sizes": [1, 32, 64] - }, - distributed_executor_backend="mp", + model_name=model, + max_num_seqs=1, + tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), + gpu_memory_utilization=0.9, + max_model_len=2048, + dtype="float16", + enable_prefix_caching=False, + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]}, + distributed_executor_backend="mp", ) as vllm_model: prompt = "The capital of France is" sampling_params = SamplingParams( @@ -479,11 +440,7 @@ def test_aclgraph_simple_generation(monkeypatch: pytest.MonkeyPatch): print(f"{'=' * 80}\n") - - - -def test_aclgraph_logprobs_without_batch_invariance_should_fail( - monkeypatch: pytest.MonkeyPatch): +def test_aclgraph_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch): """ This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN. 
It DISABLES batch invariance mode and expects to see non-deterministic behavior @@ -505,19 +462,15 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail( print(f"{'=' * 80}\n") with VllmRunner( - model_name=model_name, - tensor_parallel_size=tp_size, - enable_prefix_caching=False, - max_num_seqs=32, - max_model_len=8192, - dtype="bfloat16", - compilation_config={ - "cudagraph_mode": "FULL_DECODE_ONLY", - "cudagraph_capture_sizes": [1, 32, 64] - }, - distributed_executor_backend="mp", + model_name=model_name, + tensor_parallel_size=tp_size, + enable_prefix_caching=False, + max_num_seqs=32, + max_model_len=8192, + dtype="bfloat16", + compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 32, 64]}, + distributed_executor_backend="mp", ) as vllm_model: - # build ragged prompts to change shapes significantly across BS=1 vs BS=N long_min = int(os.getenv("VLLM_MIN_PROMPT", "768")) long_max = int(os.getenv("VLLM_MAX_PROMPT", "2048")) @@ -549,16 +502,13 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail( bs1_logprobs_per_prompt = [] bs1_tokens_per_prompt = [] for idx, p in enumerate(prompts): - print( - f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..." - ) + print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...") outs = vllm_model.generate_w_logprobs([p], sp, use_tqdm=False) assert len(outs) == 1 step_logprobs, token_ids = _extract_step_logprobs(outs[0]) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bs1_logprobs_per_prompt.append(step_logprobs) bs1_tokens_per_prompt.append(token_ids) print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") @@ -579,84 +529,90 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail( print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") step_logprobs, token_ids = _extract_step_logprobs(o) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bsN_logprobs_per_prompt.append(step_logprobs) bsN_tokens_per_prompt.append(token_ids) # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. 
differences_found = [] for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( - zip( - bs1_logprobs_per_prompt, - bsN_logprobs_per_prompt, - bs1_tokens_per_prompt, - bsN_tokens_per_prompt, - )): + zip( + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + ) + ): if len(logprobs_bs1) != len(logprobs_bsN): - reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " - f"vs {len(logprobs_bsN)} (BS=N)") - differences_found.append({ - "prompt_idx": i, - "step": "all", - "reason": reason, - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)" + differences_found.append( + { + "prompt_idx": i, + "step": "all", + "reason": reason, + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue # Check if tokens match first if tokens_bs1 != tokens_bsN: - differences_found.append({ - "prompt_idx": i, - "step": "sampling", - "reason": "Different tokens sampled", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": "sampling", + "reason": "Different tokens sampled", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): if a.shape != b.shape: - differences_found.append({ - "prompt_idx": i, - "step": t, - "reason": f"Shape mismatch: {a.shape} vs {b.shape}", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Shape mismatch: {a.shape} vs {b.shape}", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break if not torch.equal(a, b): max_diff = torch.abs(a - b).max().item() - print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, " - f"Token {t}: max_diff={max_diff:.6e}") + print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}") bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" BS=1 logprob: {a.tolist()}") print(f" BS=N logprob: {b.tolist()}") - differences_found.append({ - "prompt_idx": i, - "step": t, - "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break - # Print summary print(f"\n{'=' * 80}") if differences_found: success_msg = ( f"✓ SUCCESS: Batch invariance is doing something! " f"Found {len(differences_found)}/{len(prompts)} prompts " - f"with differences when batch invariance was DISABLED.") + f"with differences when batch invariance was DISABLED." + ) print(success_msg) print(f"{'=' * 80}") for diff in differences_found: @@ -676,7 +632,8 @@ def test_aclgraph_logprobs_without_batch_invariance_should_fail( f"✗ UNEXPECTED: All {len(prompts)} prompts matched " f"between BS=1 and BS=N even with batch invariance DISABLED. 
" f"This suggests batch invariance might not be necessary, " - f"or the test needs more sensitive prompts.") + f"or the test needs more sensitive prompts." + ) print(fail_msg) print(f"{'=' * 80}\n") pytest.fail(fail_msg) diff --git a/tests/e2e/singlecard/test_aclgraph_mem.py b/tests/e2e/singlecard/test_aclgraph_mem.py index 25d09786..ff73b168 100644 --- a/tests/e2e/singlecard/test_aclgraph_mem.py +++ b/tests/e2e/singlecard/test_aclgraph_mem.py @@ -40,7 +40,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None: capture_mem_after = multiprocessing.Value("q", -1) # long long def capture_model_wrapper(original_method): - def wrapped(self): mem_before = torch.npu.mem_get_info()[0] # free memory result = original_method(self) @@ -55,19 +54,16 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None: original_capture = NPUModelRunner.capture_model - with patch.object(NPUModelRunner, - 'capture_model', - new=capture_model_wrapper(original_capture)): + with patch.object(NPUModelRunner, "capture_model", new=capture_model_wrapper(original_capture)): prompts = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ] - sampling_params = SamplingParams(max_tokens=max_tokens, - temperature=0.0) + sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8": - vllm_model = VllmRunner(model, - max_model_len=1024, - quantization="ascend") + vllm_model = VllmRunner(model, max_model_len=1024, quantization="ascend") else: vllm_model = VllmRunner(model) _ = vllm_model.generate(prompts, sampling_params) @@ -94,5 +90,6 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None: assert mem_used_by_capture < max_mem_expected, ( f"capture_model used more memory than expected. 
" f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, " - f"Expected: < {max_capture_mem_gib:.2f} GiB") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn' + f"Expected: < {max_capture_mem_gib:.2f} GiB" + ) + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" diff --git a/tests/e2e/singlecard/test_async_scheduling.py b/tests/e2e/singlecard/test_async_scheduling.py index b8d53b84..e815d90c 100644 --- a/tests/e2e/singlecard/test_async_scheduling.py +++ b/tests/e2e/singlecard/test_async_scheduling.py @@ -15,8 +15,7 @@ from tests.e2e.model_utils import check_outputs_equal MODEL = "Qwen/Qwen3-0.6B" MTP_MODEL = "wemaster/deepseek_mtp_main_random_bf16" -first_prompt = ("The following numbers of the sequence " + - ", ".join(str(i) for i in range(10)) + " are:") +first_prompt = "The following numbers of the sequence " + ", ".join(str(i) for i in range(10)) + " are:" example_prompts = [ "Hello, my name is", "The president of the United States is", @@ -31,7 +30,9 @@ default_params = dict( ) -def test_without_spec_decoding(monkeypatch: pytest.MonkeyPatch, ): +def test_without_spec_decoding( + monkeypatch: pytest.MonkeyPatch, +): """Test consistency of combos of async scheduling, preemption, uni/multiproc executor, prefill chunking.""" test_sampling_params: list[dict[str, Any]] = [ @@ -85,11 +86,11 @@ def run_tests( # avoid precision errors outputs: list[tuple[str, list, list]] = [] for n, ( - test_preemption, - executor, - async_scheduling, - spec_config, - test_prefill_chunking, + test_preemption, + executor, + async_scheduling, + spec_config, + test_prefill_chunking, ) in enumerate(test_configs, 1): test_str = f"{n}/{len(test_configs)}" test_results = run_test( @@ -105,21 +106,18 @@ def run_tests( outputs.append(test_results) baseline_config, baseline_tests, _ = outputs[0] - _, _, baseline_acceptances = next((o for o in outputs if o[2] is not None), - (None, None, None)) + _, _, baseline_acceptances = next((o for o in outputs if o[2] is not None), (None, None, None)) - print( - f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}" - ) + print(f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}") failure = None for test_config, test_outputs, test_acceptance_rates in outputs[1:]: for base_outs, base_acceptance_rate, test_outs, test_acceptance_rate, params in zip( - baseline_tests, - baseline_acceptances or repeat(None), - test_outputs, - test_acceptance_rates or repeat(None), - test_sampling_params, + baseline_tests, + baseline_acceptances or repeat(None), + test_outputs, + test_acceptance_rates or repeat(None), + test_sampling_params, ): try: check_outputs_equal( @@ -129,21 +127,18 @@ def run_tests( name_1=f"config=[{test_config}], params={params}", ) - if (base_acceptance_rate is not None - and test_acceptance_rate is not None): + if base_acceptance_rate is not None and test_acceptance_rate is not None: if "spec_mml=None" in test_config: - assert (test_acceptance_rate > base_acceptance_rate - or test_acceptance_rate == pytest.approx( - base_acceptance_rate, rel=5e-2)) + assert test_acceptance_rate > base_acceptance_rate or test_acceptance_rate == pytest.approx( + base_acceptance_rate, rel=5e-2 + ) else: # Currently the reported acceptance rate is expected to be # lower when we sometimes skip drafting altogether. 
assert test_acceptance_rate > 0.1 - print(f"PASSED: config=[{test_config}], params={params}" - f" accept_rate={test_acceptance_rate}") + print(f"PASSED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}") except AssertionError as e: - print(f"FAILED: config=[{test_config}], params={params}" - f" accept_rate={test_acceptance_rate}") + print(f"FAILED: config=[{test_config}], params={params} accept_rate={test_acceptance_rate}") if failure is None: failure = e @@ -161,33 +156,35 @@ def run_test( spec_config: dict[str, Any] | None, test_prefill_chunking: bool, ): - os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" spec_decoding = spec_config is not None cache_arg: dict[str, Any] = ( # Force preemptions - dict(num_gpu_blocks_override=2) if test_preemption else dict( - gpu_memory_utilization=0.9)) + dict(num_gpu_blocks_override=2) if test_preemption else dict(gpu_memory_utilization=0.9) + ) spec_mml = (spec_config or {}).get("max_model_len") - test_config = (f"executor={executor}, preemption={test_preemption}, " - f"async_sched={async_scheduling}, " - f"chunk_prefill={test_prefill_chunking}, " - f"spec_decoding={spec_decoding}, spec_mml={spec_mml}") + test_config = ( + f"executor={executor}, preemption={test_preemption}, " + f"async_sched={async_scheduling}, " + f"chunk_prefill={test_prefill_chunking}, " + f"spec_decoding={spec_decoding}, spec_mml={spec_mml}" + ) print("-" * 80) print(f"---- TESTING {test_str}: {test_config}") print("-" * 80) with VllmRunner( - model, - max_model_len=512, - enable_chunked_prefill=test_prefill_chunking, - # Force prefill chunking - max_num_batched_tokens=48 if test_prefill_chunking else None, - enforce_eager=True, - async_scheduling=async_scheduling, - distributed_executor_backend=executor, - dtype="float16", # avoid precision errors - speculative_config=spec_config, - disable_log_stats=False, - **cache_arg, + model, + max_model_len=512, + enable_chunked_prefill=test_prefill_chunking, + # Force prefill chunking + max_num_batched_tokens=48 if test_prefill_chunking else None, + enforce_eager=True, + async_scheduling=async_scheduling, + distributed_executor_backend=executor, + dtype="float16", # avoid precision errors + speculative_config=spec_config, + disable_log_stats=False, + **cache_arg, ) as vllm_model: results = [] acceptance_rates: list[float] | None = [] if spec_decoding else None @@ -197,26 +194,23 @@ def run_test( results.append( vllm_model.generate( example_prompts, - sampling_params=SamplingParams(**default_params, - **override_params), - )) + sampling_params=SamplingParams(**default_params, **override_params), + ) + ) metrics_after = vllm_model.model.get_metrics() if acceptance_rates is not None: - acceptance_rate = _get_acceptance_rate(metrics_before, - metrics_after) + acceptance_rate = _get_acceptance_rate(metrics_before, metrics_after) acceptance_rates.append(acceptance_rate) print(f"ACCEPTANCE RATE {acceptance_rate}") if test_preemption: - preemptions = _get_count(metrics_before, metrics_after, - "vllm:num_preemptions") + preemptions = _get_count(metrics_before, metrics_after, "vllm:num_preemptions") assert preemptions > 0, "preemption test had no preemptions" if len(results) > 1: # First check that the different parameter configs # actually result in different output. 
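The loop below intentionally inverts the usual comparison: for differing sampling parameters it expects check_outputs_equal to raise, which is the standard pytest.raises pattern. A minimal, self-contained sketch of that pattern (the compare helper here is illustrative, not the real check_outputs_equal):

    import pytest

    def compare(a: list[str], b: list[str]) -> None:
        assert a == b, "outputs differ"

    # The assertion is expected to fire for genuinely different outputs.
    with pytest.raises(AssertionError):
        compare(["Hello"], ["Bonjour"])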
- for other_test_outs, params in zip(results[1:], - sampling_param_tests[1:]): + for other_test_outs, params in zip(results[1:], sampling_param_tests[1:]): with pytest.raises(AssertionError): check_outputs_equal( outputs_0_lst=results[0][0], diff --git a/tests/e2e/singlecard/test_auto_fit_max_mode_len.py b/tests/e2e/singlecard/test_auto_fit_max_mode_len.py index a576f132..814147ab 100644 --- a/tests/e2e/singlecard/test_auto_fit_max_mode_len.py +++ b/tests/e2e/singlecard/test_auto_fit_max_mode_len.py @@ -42,6 +42,7 @@ def new_kv_cache_spec( attention_chunk_size=attention_chunk_size, ) + def test_auto_fit_max_model_len(): """Test that max_model_len=-1 auto-fits to available NPU memory.""" # Create config with original_max_model_len=-1 to trigger auto-fit @@ -59,9 +60,7 @@ def test_auto_fit_max_model_len(): # With enough memory, max_model_len stays at the derived max large_available_memory = mem_per_block_per_layer * 2 * 1024 # plenty of memory - _kv_cache_configs = get_kv_cache_configs( - vllm_config, [kv_cache_specs], [large_available_memory] - ) + _kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [large_available_memory]) assert vllm_config.model_config.max_model_len == 1024 # Reset for next test @@ -73,9 +72,7 @@ def test_auto_fit_max_model_len(): # Need memory for at least max_model_len tokens # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens limited_memory = mem_per_block_per_layer * 2 * 32 - _kv_cache_configs = get_kv_cache_configs( - vllm_config, [kv_cache_specs], [limited_memory] - ) + _kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [limited_memory]) # Should be reduced to fit in memory assert vllm_config.model_config.max_model_len < 1024 assert vllm_config.model_config.max_model_len > 0 @@ -94,7 +91,5 @@ def test_auto_fit_max_model_len_not_triggered(): } # This should work normally without auto-fit - _kv_cache_configs = get_kv_cache_configs( - vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32] - ) + _kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]) assert vllm_config.model_config.max_model_len == 16 diff --git a/tests/e2e/singlecard/test_batch_invariant.py b/tests/e2e/singlecard/test_batch_invariant.py index d4fd423c..50a86cdb 100644 --- a/tests/e2e/singlecard/test_batch_invariant.py +++ b/tests/e2e/singlecard/test_batch_invariant.py @@ -70,9 +70,7 @@ def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: if target_words > 50: # For longer prompts, repeat context - padding_text = ( - " This is an interesting topic that deserves more explanation. " * - (target_words // 50)) + padding_text = " This is an interesting topic that deserves more explanation. 
" * (target_words // 50) base_prompt = base_prompt + padding_text return base_prompt @@ -83,10 +81,7 @@ def _extract_step_logprobs(request_output): inner = request_output.outputs[0] if hasattr(inner, "logprobs") and inner.logprobs is not None: t = torch.tensor( - [ - inner.logprobs[i][tid].logprob - for i, tid in enumerate(inner.token_ids) - ], + [inner.logprobs[i][tid].logprob for i, tid in enumerate(inner.token_ids)], dtype=torch.float32, ) return t, inner.token_ids @@ -95,8 +90,7 @@ def _extract_step_logprobs(request_output): @pytest.mark.timeout(1000) -def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( - monkeypatch: pytest.MonkeyPatch): +def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(monkeypatch: pytest.MonkeyPatch): """ Ensures that the same request (the 'needle' prompt) yields identical output whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64), @@ -184,8 +178,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( if i == needle_pos: prompts.append(needle_prompt) else: - prompts.append( - _random_prompt(min_random_prompt, max_random_prompt)) + prompts.append(_random_prompt(min_random_prompt, max_random_prompt)) # Generate with the larger-batch engine outputs = llm.generate(prompts, sampling) @@ -196,27 +189,27 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( text = needle_output.outputs[0].text if text != baseline_text: - print( - f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n") + print(f"{text}\n\n== Not the same as ==\n\n{baseline_text}\n\n") mismatches += 1 passes = num_trials - mismatches # Dump how many passed vs failed - print(f"[determinism] total={num_trials}, passed={passes}, " - f"failed={mismatches}, max_batch_size={max_batch_size}") + print( + f"[determinism] total={num_trials}, passed={passes}, failed={mismatches}, max_batch_size={max_batch_size}" + ) if mismatches > 0: pytest.fail( f"Nondeterministic outputs detected: {mismatches} failed out " - f"of {num_trials} trials (max_batch_size={max_batch_size}).") + f"of {num_trials} trials (max_batch_size={max_batch_size})." + ) finally: del llm cleanup_dist_env_and_memory() -def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( - monkeypatch: pytest.MonkeyPatch): +def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.MonkeyPatch): seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = DEFAULT_MODEL @@ -230,9 +223,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( if disable_custom_ar: print(f"\n{'=' * 80}") - print( - f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})" - ) + print(f"BATCH INVARIANCE MODE: Disabling custom all-reduce (TP={tp_size})") print(f"{'=' * 80}\n") llm = LLM( @@ -266,15 +257,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( bs1_logprobs_per_prompt = [] bs1_tokens_per_prompt = [] for idx, p in enumerate(prompts): - print( - f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..." 
- ) + print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...") outs = llm.generate([p], sp, use_tqdm=False) assert len(outs) == 1 step_logprobs, token_ids = _extract_step_logprobs(outs[0]) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bs1_logprobs_per_prompt.append(step_logprobs) bs1_tokens_per_prompt.append(token_ids) print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") @@ -296,108 +284,92 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") step_logprobs, token_ids = _extract_step_logprobs(o) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bsN_logprobs_per_prompt.append(step_logprobs) bsN_tokens_per_prompt.append(token_ids) # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. failed_prompts = [] for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( - zip( - bs1_logprobs_per_prompt, - bsN_logprobs_per_prompt, - bs1_tokens_per_prompt, - bsN_tokens_per_prompt, - )): + zip( + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + ) + ): if len(logprobs_bs1) != len(logprobs_bsN): - reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " - f"vs {len(logprobs_bsN)} (BS=N)") - failed_prompts.append({ - "prompt_idx": i, - "step": "all", - "reason": reason, - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)" + failed_prompts.append( + { + "prompt_idx": i, + "step": "all", + "reason": reason, + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue # Check if tokens match first if tokens_bs1 != tokens_bsN: - failed_prompts.append({ - "prompt_idx": - i, - "step": - "sampling", - "reason": - "Different tokens sampled", - "prompt_preview": - prompts[i][:100], - "bs1_tokens": - tokens_bs1, - "bsN_tokens": - tokens_bsN, - "bs1_all_logprobs": - [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], - "bsN_all_logprobs": - [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": "sampling", + "reason": "Different tokens sampled", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], + "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], + } + ) continue for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): if a.shape != b.shape: - failed_prompts.append({ - "prompt_idx": i, - "step": t, - "reason": f"Shape mismatch: {a.shape} vs {b.shape}", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Shape mismatch: {a.shape} vs {b.shape}", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break if not torch.equal(a, b): max_diff = torch.abs(a - 
b).max().item() # Print which token failed - print( - f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}" - ) + print(f"\n[DIVERGENCE] Prompt {i}, Token {t}: max_diff={max_diff:.6e}") bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" print(f" Token IDs: bs1={bs1_tok}, bsN={bsN_tok}") print(f" BS=1 logprob: {a.tolist()}") print(f" BS=N logprob: {b.tolist()}") - failed_prompts.append({ - "prompt_idx": - i, - "step": - t, - "reason": - f"Bitwise mismatch (max_diff={max_diff:.6e})", - "prompt_preview": - prompts[i][:100], - "bs1_tokens": - tokens_bs1, - "bsN_tokens": - tokens_bsN, - "bs1_all_logprobs": [ - logprobs_bs1[s].tolist() - for s in range(len(logprobs_bs1)) - ], - "bsN_all_logprobs": [ - logprobs_bsN[s].tolist() - for s in range(len(logprobs_bsN)) - ], - }) + failed_prompts.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + "bs1_all_logprobs": [logprobs_bs1[s].tolist() for s in range(len(logprobs_bs1))], + "bsN_all_logprobs": [logprobs_bsN[s].tolist() for s in range(len(logprobs_bsN))], + } + ) break del llm cleanup_dist_env_and_memory() # Print summary of all failures if failed_prompts: print(f"\n{'=' * 80}") - fail_msg = (f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/" - f"{len(prompts)} prompts failed") + fail_msg = f"BATCH INVARIANCE FAILURES: {len(failed_prompts)}/{len(prompts)} prompts failed" print(fail_msg) print(f"{'=' * 80}") for fail in failed_prompts: @@ -412,21 +384,18 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( print(f" BS=N tokens: {fail['bsN_tokens']}") if "bs1_all_logprobs" in fail: - print( - f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:" - ) + print(f" BS=1 logprobs for all {len(fail['bs1_all_logprobs'])} steps:") for step_idx, logprobs in enumerate(fail["bs1_all_logprobs"]): print(f" Step {step_idx}: {logprobs}") - print( - f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:" - ) + print(f" BS=N logprobs for all {len(fail['bsN_all_logprobs'])} steps:") for step_idx, logprobs in enumerate(fail["bsN_all_logprobs"]): print(f" Step {step_idx}: {logprobs}") print(f"{'=' * 80}\n") # Fail the test with summary - msg = (f"Batch invariance violated in {len(failed_prompts)}/" - f"{len(prompts)} prompts. See output above for details.") + msg = ( + f"Batch invariance violated in {len(failed_prompts)}/{len(prompts)} prompts. See output above for details." + ) pytest.fail(msg) @@ -476,8 +445,7 @@ def test_simple_generation(monkeypatch: pytest.MonkeyPatch): cleanup_dist_env_and_memory() -def test_logprobs_without_batch_invariance_should_fail( - monkeypatch: pytest.MonkeyPatch): +def test_logprobs_without_batch_invariance_should_fail(monkeypatch: pytest.MonkeyPatch): """ This test is the inverse of test_logprobs_bitwise_batch_invariance_bs1_vs_bsN. It DISABLES batch invariance mode and expects to see non-deterministic behavior @@ -540,15 +508,12 @@ def test_logprobs_without_batch_invariance_should_fail( bs1_logprobs_per_prompt = [] bs1_tokens_per_prompt = [] for idx, p in enumerate(prompts): - print( - f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}..." 
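Both batch-invariance tests compare per-step logprobs with torch.equal, i.e. exact elementwise equality (the "bitwise" check referred to in the failure messages above), rather than a tolerance-based check. A minimal, self-contained sketch of the difference, assuming nothing beyond stock torch:

    import torch

    a = torch.tensor([1.0, 2.0])
    b = a + 1e-6                  # tiny numeric drift

    assert not torch.equal(a, b)  # exact comparison: any difference fails
    assert torch.allclose(a, b)   # tolerance-based comparison: passes (default rtol=1e-5)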
- ) + print(f"\n[BS=1] Running prompt {idx}/{len(prompts)} - Preview: {p[:80]}...") outs = llm.generate([p], sp, use_tqdm=False) assert len(outs) == 1 step_logprobs, token_ids = _extract_step_logprobs(outs[0]) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bs1_logprobs_per_prompt.append(step_logprobs) bs1_tokens_per_prompt.append(token_ids) print(f"[BS=1] Prompt {idx} generated tokens: {token_ids}") @@ -569,74 +534,80 @@ def test_logprobs_without_batch_invariance_should_fail( print(f"[BS={len(prompts)}] Prompt {idx} generated tokens: {tokens}") step_logprobs, token_ids = _extract_step_logprobs(o) if step_logprobs is None: - pytest.skip("Logits are not available on RequestOutput; " - "enable logprobs return to run this test.") + pytest.skip("Logits are not available on RequestOutput; enable logprobs return to run this test.") bsN_logprobs_per_prompt.append(step_logprobs) bsN_tokens_per_prompt.append(token_ids) # Compare step-by-step logprobs for each prompt between BS=1 and BS=N runs. differences_found = [] for i, (logprobs_bs1, logprobs_bsN, tokens_bs1, tokens_bsN) in enumerate( - zip( - bs1_logprobs_per_prompt, - bsN_logprobs_per_prompt, - bs1_tokens_per_prompt, - bsN_tokens_per_prompt, - )): + zip( + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + ) + ): if len(logprobs_bs1) != len(logprobs_bsN): - reason = (f"Different number of steps: {len(logprobs_bs1)} (BS=1) " - f"vs {len(logprobs_bsN)} (BS=N)") - differences_found.append({ - "prompt_idx": i, - "step": "all", - "reason": reason, - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + reason = f"Different number of steps: {len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)" + differences_found.append( + { + "prompt_idx": i, + "step": "all", + "reason": reason, + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue # Check if tokens match first if tokens_bs1 != tokens_bsN: - differences_found.append({ - "prompt_idx": i, - "step": "sampling", - "reason": "Different tokens sampled", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": "sampling", + "reason": "Different tokens sampled", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) continue for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): if a.shape != b.shape: - differences_found.append({ - "prompt_idx": i, - "step": t, - "reason": f"Shape mismatch: {a.shape} vs {b.shape}", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Shape mismatch: {a.shape} vs {b.shape}", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break if not torch.equal(a, b): max_diff = torch.abs(a - b).max().item() - print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, " - f"Token {t}: max_diff={max_diff:.6e}") + print(f"\n[EXPECTED DIVERGENCE FOUND] Prompt {i}, Token {t}: max_diff={max_diff:.6e}") bs1_tok = tokens_bs1[t] if t < len(tokens_bs1) else "N/A" bsN_tok = tokens_bsN[t] if t < len(tokens_bsN) else "N/A" print(f" Token IDs: 
bs1={bs1_tok}, bsN={bsN_tok}") print(f" BS=1 logprob: {a.tolist()}") print(f" BS=N logprob: {b.tolist()}") - differences_found.append({ - "prompt_idx": i, - "step": t, - "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", - "prompt_preview": prompts[i][:100], - "bs1_tokens": tokens_bs1, - "bsN_tokens": tokens_bsN, - }) + differences_found.append( + { + "prompt_idx": i, + "step": t, + "reason": f"Bitwise mismatch (max_diff={max_diff:.6e})", + "prompt_preview": prompts[i][:100], + "bs1_tokens": tokens_bs1, + "bsN_tokens": tokens_bsN, + } + ) break del llm cleanup_dist_env_and_memory() @@ -646,7 +617,8 @@ def test_logprobs_without_batch_invariance_should_fail( success_msg = ( f"✓ SUCCESS: Batch invariance is doing something! " f"Found {len(differences_found)}/{len(prompts)} prompts " - f"with differences when batch invariance was DISABLED.") + f"with differences when batch invariance was DISABLED." + ) print(success_msg) print(f"{'=' * 80}") for diff in differences_found: @@ -666,7 +638,8 @@ def test_logprobs_without_batch_invariance_should_fail( f"✗ UNEXPECTED: All {len(prompts)} prompts matched " f"between BS=1 and BS=N even with batch invariance DISABLED. " f"This suggests batch invariance might not be necessary, " - f"or the test needs more sensitive prompts.") + f"or the test needs more sensitive prompts." + ) print(fail_msg) print(f"{'=' * 80}\n") pytest.fail(fail_msg) diff --git a/tests/e2e/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py index 5bdf68b7..db9762ae 100644 --- a/tests/e2e/singlecard/test_camem.py +++ b/tests/e2e/singlecard/test_camem.py @@ -37,10 +37,7 @@ def test_end_to_end(): prompt = "How are you?" sampling_params = SamplingParams(temperature=0, max_tokens=10) - with VllmRunner("Qwen/Qwen3-0.6B", - enable_sleep_mode=True, - cudagraph_capture_sizes=[1, 2, 4, 8]) as runner: - + with VllmRunner("Qwen/Qwen3-0.6B", enable_sleep_mode=True, cudagraph_capture_sizes=[1, 2, 4, 8]) as runner: output = runner.model.generate(prompt, sampling_params) # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, # which is difficult to measure in the test. 
therefore, we only diff --git a/tests/e2e/singlecard/test_completion_with_prompt_embeds.py b/tests/e2e/singlecard/test_completion_with_prompt_embeds.py index 0e8ececa..2d6993f6 100644 --- a/tests/e2e/singlecard/test_completion_with_prompt_embeds.py +++ b/tests/e2e/singlecard/test_completion_with_prompt_embeds.py @@ -30,9 +30,7 @@ MODELS = ["Qwen/Qwen3-0.6B"] def get_prompt_embeds(chat, tokenizer, embedding_layer): """Convert chat messages to prompt embeddings.""" - token_ids = tokenizer.apply_chat_template(chat, - add_generation_prompt=True, - return_tensors='pt') + token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt") prompt_embeds = embedding_layer(token_ids).squeeze(0) return prompt_embeds @@ -53,15 +51,16 @@ def test_mixed_prompt_embeds_and_text(model_name): # Run inference with mixed inputs with VllmRunner( - model_name, - enable_prompt_embeds=True, - cudagraph_capture_sizes=[1, 2, 4, 8], + model_name, + enable_prompt_embeds=True, + cudagraph_capture_sizes=[1, 2, 4, 8], ) as vllm_runner: # Test prompt embeddings - embeds_output = vllm_runner.model.generate({ - "prompt_embeds": - prompt_embeds, - }) + embeds_output = vllm_runner.model.generate( + { + "prompt_embeds": prompt_embeds, + } + ) # Test text prompt text_output = vllm_runner.model.generate(text_prompt) diff --git a/tests/e2e/singlecard/test_cpu_offloading.py b/tests/e2e/singlecard/test_cpu_offloading.py index e51a70d9..61b15597 100644 --- a/tests/e2e/singlecard/test_cpu_offloading.py +++ b/tests/e2e/singlecard/test_cpu_offloading.py @@ -107,15 +107,13 @@ def _latency_test(llm: LLM, subscriber: MockSubscriber): def _accuracy_test(llm: LLM, subscriber: MockSubscriber): sampling_params = SamplingParams(max_tokens=1) - cpu_block_size = (llm.llm_engine.vllm_config.kv_transfer_config. - kv_connector_extra_config["block_size"]) + cpu_block_size = llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config["block_size"] subscriber.get_new_cpu_stored_events() # prepend prompt to be cpu block aligned prompt = "Let's count to 10. One, two, three, four," - while (len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % - cpu_block_size != 0): + while len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size != 0: prompt = ". " + prompt assert subscriber.get_new_cpu_stored_events() @@ -123,8 +121,7 @@ def _accuracy_test(llm: LLM, subscriber: MockSubscriber): test_count = 100 success_count = 0 for i in range(test_count): - if (llm.generate(prompt, sampling_params, - use_tqdm=False)[0].outputs[0].text == " five"): + if llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text == " five": success_count += 1 assert success_count >= 0.5 * test_count @@ -143,7 +140,7 @@ def test_cpu_offloading() -> None: "num_cpu_blocks": 1000, "block_size": 128, "spec_name": "NPUOffloadingSpec", - "spec_module_path": "vllm_ascend.kv_offload.npu" + "spec_module_path": "vllm_ascend.kv_offload.npu", }, ) diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py index 8b1d83a8..fb1546ce 100644 --- a/tests/e2e/singlecard/test_guided_decoding.py +++ b/tests/e2e/singlecard/test_guided_decoding.py @@ -17,7 +17,7 @@ # limitations under the License. 
# import json -from typing import Any, Dict +from typing import Any import jsonschema import pytest @@ -34,8 +34,10 @@ GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"] @pytest.fixture(scope="module") def sample_regex(): - return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + return ( + r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" + ) @pytest.fixture(scope="module") @@ -43,66 +45,41 @@ def sample_json_schema(): return { "type": "object", "properties": { - "name": { - "type": "string" - }, - "age": { - "type": "integer" - }, - "skills": { - "type": "array", - "items": { - "type": "string", - "maxLength": 10 - }, - "minItems": 3 - }, + "name": {"type": "string"}, + "age": {"type": "integer"}, + "skills": {"type": "array", "items": {"type": "string", "maxLength": 10}, "minItems": 3}, "work_history": { "type": "array", "items": { "type": "object", "properties": { - "company": { - "type": "string" - }, - "duration": { - "type": "number" - }, - "position": { - "type": "string" - } + "company": {"type": "string"}, + "duration": {"type": "number"}, + "position": {"type": "string"}, }, - "required": ["company", "position"] - } - } + "required": ["company", "position"], + }, + }, }, - "required": ["name", "age", "skills", "work_history"] + "required": ["name", "age", "skills", "work_history"], } @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) -def test_guided_json_completion(guided_decoding_backend: str, - sample_json_schema): - runner_kwargs: Dict[str, Any] = {} +def test_guided_json_completion(guided_decoding_backend: str, sample_json_schema): + runner_kwargs: dict[str, Any] = {} sampling_params = SamplingParams( - temperature=1.0, - max_tokens=500, - structured_outputs=StructuredOutputsParams(json=sample_json_schema)) + temperature=1.0, max_tokens=500, structured_outputs=StructuredOutputsParams(json=sample_json_schema) + ) runner_kwargs = { "cudagraph_capture_sizes": [1, 2, 4, 8], "seed": 0, - "structured_outputs_config": { - "backend": guided_decoding_backend - }, + "structured_outputs_config": {"backend": guided_decoding_backend}, } with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model: - prompts = [ - f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}" - ] * 2 + prompts = [f"Give an example JSON for an employee profile that fits this schema: {sample_json_schema}"] * 2 inputs = vllm_model.get_inputs(prompts) - outputs = vllm_model.model.generate(inputs, - sampling_params=sampling_params) + outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params) assert outputs is not None @@ -115,34 +92,27 @@ def test_guided_json_completion(guided_decoding_backend: str, assert generated_text is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, - schema=sample_json_schema) + jsonschema.validate(instance=output_json, schema=sample_json_schema) @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) def test_guided_regex(guided_decoding_backend: str, sample_regex): if guided_decoding_backend == "outlines": pytest.skip("Outlines doesn't support regex-based guided decoding.") - runner_kwargs: Dict[str, Any] = {} + runner_kwargs: dict[str, Any] = {} sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - structured_outputs=StructuredOutputsParams(regex=sample_regex)) + temperature=0.8, top_p=0.95, 
structured_outputs=StructuredOutputsParams(regex=sample_regex) + ) runner_kwargs = { "cudagraph_capture_sizes": [1, 2, 4, 8], "seed": 0, - "structured_outputs_config": { - "backend": guided_decoding_backend - }, + "structured_outputs_config": {"backend": guided_decoding_backend}, } with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model: - prompts = [ - f"Give an example IPv4 address with this regex: {sample_regex}" - ] * 2 + prompts = [f"Give an example IPv4 address with this regex: {sample_regex}"] * 2 inputs = vllm_model.get_inputs(prompts) - outputs = vllm_model.model.generate(inputs, - sampling_params=sampling_params) + outputs = vllm_model.model.generate(inputs, sampling_params=sampling_params) assert outputs is not None for output in outputs: assert output is not None diff --git a/tests/e2e/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py index d59acd30..bf578cb0 100644 --- a/tests/e2e/singlecard/test_ilama_lora.py +++ b/tests/e2e/singlecard/test_ilama_lora.py @@ -19,20 +19,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( - query= - "What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 + query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 ), PROMPT_TEMPLATE.format( - query= - "What are all distinct countries where singers above age 20 are from?" # noqa: E501 + query="What are all distinct countries where singers above age 20 are from?" # noqa: E501 ), ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + prompts, sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None + ) # Print the outputs. 
generated_texts: list[str] = [] for output in outputs: @@ -45,16 +41,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def test_ilama_lora(ilama_lora_files): with VllmRunner( - MODEL_PATH, - enable_lora=True, - dtype="half", - max_loras=4, - max_model_len=1024, - cudagraph_capture_sizes=[1, 2, 4, 8], - max_num_seqs=16, - enforce_eager=True, + MODEL_PATH, + enable_lora=True, + dtype="half", + max_loras=4, + max_model_len=1024, + cudagraph_capture_sizes=[1, 2, 4, 8], + max_num_seqs=16, + enforce_eager=True, ) as vllm_model: - output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1) for i in range(len(EXPECTED_LORA_OUTPUT)): assert output1[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/e2e/singlecard/test_llama32_lora.py b/tests/e2e/singlecard/test_llama32_lora.py index ead2827e..ab7015b2 100644 --- a/tests/e2e/singlecard/test_llama32_lora.py +++ b/tests/e2e/singlecard/test_llama32_lora.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest +from unittest.mock import patch +import pytest import vllm import vllm.config from vllm.lora.request import LoRARequest -from unittest.mock import patch from tests.e2e.conftest import VllmRunner from vllm_ascend.utils import enable_custom_op @@ -53,17 +53,12 @@ def do_sample( PROMPT_TEMPLATE.format(context="How many candidates are there?"), PROMPT_TEMPLATE.format(context="Count the number of candidates."), PROMPT_TEMPLATE.format( - context= - "Which poll resource provided the most number of candidate information?" # noqa: E501 + context="Which poll resource provided the most number of candidate information?" # noqa: E501 ), - PROMPT_TEMPLATE.format( - context= - "Return the poll resource associated with the most candidates."), + PROMPT_TEMPLATE.format(context="Return the poll resource associated with the most candidates."), ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=64, - stop=["<|im_end|>"]) + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64, stop=["<|im_end|>"]) if tensorizer_config_dict is not None: outputs = llm.generate( prompts, @@ -73,14 +68,15 @@ def do_sample( lora_id, lora_path, tensorizer_config_dict=tensorizer_config_dict, - ) if lora_id else None, + ) + if lora_id + else None, ) else: outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, ) generated_texts: list[str] = [] @@ -92,33 +88,40 @@ def do_sample( return generated_texts -def generate_and_test(llm, - llama32_lora_files, - tensorizer_config_dict: dict | None = None): +def generate_and_test(llm, llama32_lora_files, tensorizer_config_dict: dict | None = None): print("lora adapter created") print("lora 1") - assert (do_sample( - llm, - llama32_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=1, - ) == EXPECTED_LORA_OUTPUT) + assert ( + do_sample( + llm, + llama32_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=1, + ) + == EXPECTED_LORA_OUTPUT + ) print("lora 2") - assert (do_sample( - llm, - llama32_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=2, - ) == EXPECTED_LORA_OUTPUT) + assert ( + do_sample( + llm, + llama32_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=2, + ) + == EXPECTED_LORA_OUTPUT + ) print("base model") - assert (do_sample( - llm, - 
llama32_lora_files, - tensorizer_config_dict=tensorizer_config_dict, - lora_id=0, - ) == EXPECTED_BASE_MODEL_OUTPUT) + assert ( + do_sample( + llm, + llama32_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0, + ) + == EXPECTED_BASE_MODEL_OUTPUT + ) print("removing lora") diff --git a/tests/e2e/singlecard/test_models.py b/tests/e2e/singlecard/test_models.py index 659b5f69..fcbde3b6 100644 --- a/tests/e2e/singlecard/test_models.py +++ b/tests/e2e/singlecard/test_models.py @@ -45,9 +45,7 @@ def test_minicpm(model) -> None: ] max_tokens = 5 - with VllmRunner(model, - max_model_len=512, - gpu_memory_utilization=0.7) as runner: + with VllmRunner(model, max_model_len=512, gpu_memory_utilization=0.7) as runner: runner.generate_greedy(example_prompts, max_tokens) @@ -56,19 +54,12 @@ def test_whisper(model) -> None: prompts = ["<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"] audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate] - sampling_params = SamplingParams(temperature=0.2, - max_tokens=10, - stop_token_ids=None) + sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None) - with VllmRunner(model, - max_model_len=448, - max_num_seqs=5, - dtype="bfloat16", - block_size=128, - gpu_memory_utilization=0.9) as runner: - outputs = runner.generate(prompts=prompts, - audios=audios, - sampling_params=sampling_params) + with VllmRunner( + model, max_model_len=448, max_num_seqs=5, dtype="bfloat16", block_size=128, gpu_memory_utilization=0.9 + ) as runner: + outputs = runner.generate(prompts=prompts, audios=audios, sampling_params=sampling_params) assert outputs is not None, "Generated outputs should not be None." assert len(outputs) > 0, "Generated outputs should not be empty." diff --git a/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py b/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py index caf09bd9..f673b022 100644 --- a/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py +++ b/tests/e2e/singlecard/test_multistream_overlap_shared_expert.py @@ -39,59 +39,56 @@ def test_models_with_multistream_overlap_shared_expert( max_tokens: int, ) -> None: prompts = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ] sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0) with VllmRunner( - model, - max_model_len=1024, - enforce_eager=True, - cudagraph_capture_sizes=[4, 8, 16, 32], - additional_config={ - "multistream_overlap_shared_expert": True, - }, - quantization="ascend", + model, + max_model_len=1024, + enforce_eager=True, + cudagraph_capture_sizes=[4, 8, 16, 32], + additional_config={ + "multistream_overlap_shared_expert": True, + }, + quantization="ascend", ) as runner: - vllm_moe_ms_eager_outputs = runner.model.generate( - prompts, sampling_params) + vllm_moe_ms_eager_outputs = runner.model.generate(prompts, sampling_params) with VllmRunner( - model, - max_model_len=1024, - cudagraph_capture_sizes=[4, 8, 16, 32], - additional_config={ - "multistream_overlap_shared_expert": True, - }, - quantization="ascend", + model, + max_model_len=1024, + cudagraph_capture_sizes=[4, 8, 16, 32], + additional_config={ + "multistream_overlap_shared_expert": True, + }, + quantization="ascend", ) as runner: - vllm_moe_ms_aclgraph_outputs = runner.model.generate( - prompts, sampling_params) + 
vllm_moe_ms_aclgraph_outputs = runner.model.generate(prompts, sampling_params) with VllmRunner( - model, - max_model_len=1024, - enforce_eager=True, - cudagraph_capture_sizes=[4, 8, 16, 32], - quantization="ascend", + model, + max_model_len=1024, + enforce_eager=True, + cudagraph_capture_sizes=[4, 8, 16, 32], + quantization="ascend", ) as runner: vllm_eager_outputs = runner.model.generate(prompts, sampling_params) vllm_moe_ms_eager_outputs_list = [] for output in vllm_moe_ms_eager_outputs: - vllm_moe_ms_eager_outputs_list.append( - (output.outputs[0].index, output.outputs[0].text)) + vllm_moe_ms_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text)) vllm_moe_ms_aclgraph_outputs_list = [] for output in vllm_moe_ms_aclgraph_outputs: - vllm_moe_ms_aclgraph_outputs_list.append( - (output.outputs[0].index, output.outputs[0].text)) + vllm_moe_ms_aclgraph_outputs_list.append((output.outputs[0].index, output.outputs[0].text)) vllm_eager_outputs_list = [] for output in vllm_eager_outputs: - vllm_eager_outputs_list.append( - (output.outputs[0].index, output.outputs[0].text)) + vllm_eager_outputs_list.append((output.outputs[0].index, output.outputs[0].text)) check_outputs_equal( outputs_0_lst=vllm_eager_outputs_list, diff --git a/tests/e2e/singlecard/test_quantization.py b/tests/e2e/singlecard/test_quantization.py index 119be0c2..b50ac3cf 100644 --- a/tests/e2e/singlecard/test_quantization.py +++ b/tests/e2e/singlecard/test_quantization.py @@ -19,6 +19,7 @@ from tests.e2e.conftest import VllmRunner from tests.e2e.model_utils import check_outputs_equal +# fmt: off def test_qwen3_w8a8_quant(): max_tokens = 5 example_prompts = [ @@ -29,6 +30,7 @@ def test_qwen3_w8a8_quant(): 13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387 ], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be' )] +# fmt: on with VllmRunner( "vllm-ascend/Qwen3-0.6B-W8A8", @@ -47,7 +49,7 @@ def test_qwen3_w8a8_quant(): name_1="vllm_quant_w8a8_outputs", ) - +# fmt: off def test_qwen3_dense_w8a16(): max_tokens = 5 example_prompts = [ @@ -58,6 +60,7 @@ def test_qwen3_dense_w8a16(): 13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387 ], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be' )] +# fmt: on with VllmRunner( "vllm-ascend/Qwen3-0.6B-W8A16", diff --git a/tests/e2e/singlecard/test_qwen3_multi_loras.py b/tests/e2e/singlecard/test_qwen3_multi_loras.py index 60d61325..53beaf43 100644 --- a/tests/e2e/singlecard/test_qwen3_multi_loras.py +++ b/tests/e2e/singlecard/test_qwen3_multi_loras.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from unittest.mock import patch + from vllm import SamplingParams from vllm.lora.request import LoRARequest -from unittest.mock import patch from tests.e2e.conftest import VllmRunner from vllm_ascend.utils import enable_custom_op @@ -27,16 +28,11 @@ LORA_TEST_EXPECTED = [ def format_chatml_messages(prompt: str): return [ - { - "role": "system", - "content": "You are a helpful assistant." 
- }, - { - "role": "user", - "content": prompt - }, + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt}, ] + @patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"}) def test_multi_loras_with_tp_sync(): lora_name_id_map = {} @@ -102,9 +98,7 @@ def test_multi_loras_with_tp_sync(): outputs = llm.chat( [messages], sampling_params, - chat_template_kwargs={ - "enable_thinking": False - }, # for those loras, ensure enable_thinking=False + chat_template_kwargs={"enable_thinking": False}, # for those loras, ensure enable_thinking=False lora_request=lora_request, use_tqdm=False, ) @@ -113,15 +107,13 @@ def test_multi_loras_with_tp_sync(): def reload_lora(name: str): """ - reload a lora to simulate the case: - setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` + reload a lora to simulate the case: + setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` for dynamic lora loading and unloading """ - remove_lora_response = llm.llm_engine.remove_lora( - lora_id=lora_name_id_map[name]) + remove_lora_response = llm.llm_engine.remove_lora(lora_id=lora_name_id_map[name]) - add_lora_response = llm.llm_engine.add_lora( - make_add_lora_request(name, LORA_NAME_PATH_MAP[name])) + add_lora_response = llm.llm_engine.add_lora(make_add_lora_request(name, LORA_NAME_PATH_MAP[name])) print(f"{remove_lora_response=}, {add_lora_response=}") @@ -131,7 +123,6 @@ def test_multi_loras_with_tp_sync(): assert outputs == expected for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED): - output_text = call_llm_get_outputs(prompt, "Alice") check_outputs(output_text, expected_output, prompt) diff --git a/tests/e2e/singlecard/test_sampler.py b/tests/e2e/singlecard/test_sampler.py index 894977b8..9e64276a 100644 --- a/tests/e2e/singlecard/test_sampler.py +++ b/tests/e2e/singlecard/test_sampler.py @@ -25,15 +25,11 @@ def test_qwen3_topk() -> None: example_prompts = [ "Hello, my name is", ] - sampling_params = SamplingParams(max_tokens=5, - temperature=0.0, - top_k=50, - top_p=0.9) + sampling_params = SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9) - with VllmRunner("Qwen/Qwen3-0.6B", - max_model_len=8192, - cudagraph_capture_sizes=[1, 2, 4, 8], - gpu_memory_utilization=0.7) as runner: + with VllmRunner( + "Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7 + ) as runner: runner.generate(example_prompts, sampling_params) @@ -42,29 +38,25 @@ def test_qwen3_prompt_logprobs() -> None: "Hello, my name is", ] - with VllmRunner("Qwen/Qwen3-0.6B", - max_model_len=8192, - cudagraph_capture_sizes=[1, 2, 4, 8], - gpu_memory_utilization=0.7) as runner: - runner.generate_greedy_logprobs(example_prompts, - max_tokens=5, - num_logprobs=1) + with VllmRunner( + "Qwen/Qwen3-0.6B", max_model_len=8192, cudagraph_capture_sizes=[1, 2, 4, 8], gpu_memory_utilization=0.7 + ) as runner: + runner.generate_greedy_logprobs(example_prompts, max_tokens=5, num_logprobs=1) def test_qwen3_exponential_overlap() -> None: example_prompts = [ "Hello, my name is", ] - sampling_params = SamplingParams(max_tokens=5, - temperature=1.0, - top_k=50, - top_p=0.9) + sampling_params = SamplingParams(max_tokens=5, temperature=1.0, top_k=50, top_p=0.9) - with VllmRunner("Qwen/Qwen3-0.6B", - max_model_len=8192, - cudagraph_capture_sizes=[1, 2, 4, 8], - gpu_memory_utilization=0.7, - additional_config={ - "enable_async_exponential": True, - }) as runner: + with VllmRunner( + "Qwen/Qwen3-0.6B", + max_model_len=8192, + cudagraph_capture_sizes=[1, 2, 4, 
8], + gpu_memory_utilization=0.7, + additional_config={ + "enable_async_exponential": True, + }, + ) as runner: runner.generate(example_prompts, sampling_params) diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py index 33e896ff..e878f78a 100644 --- a/tests/e2e/singlecard/test_vlm.py +++ b/tests/e2e/singlecard/test_vlm.py @@ -20,6 +20,7 @@ Run `pytest tests/test_offline_inference.py`. """ + import os from unittest.mock import patch @@ -44,11 +45,13 @@ def test_multimodal_vl(vl_config): images = [image] * len(img_questions) prompts = vl_config["prompt_fn"](img_questions) - with VllmRunner(vl_config["model"], - mm_processor_kwargs=vl_config["mm_processor_kwargs"], - max_model_len=8192, - cudagraph_capture_sizes=[1, 2, 4, 8], - limit_mm_per_prompt={"image": 1}) as vllm_model: + with VllmRunner( + vl_config["model"], + mm_processor_kwargs=vl_config["mm_processor_kwargs"], + max_model_len=8192, + cudagraph_capture_sizes=[1, 2, 4, 8], + limit_mm_per_prompt={"image": 1}, + ) as vllm_model: outputs = vllm_model.generate_greedy( prompts=prompts, images=images, @@ -63,35 +66,30 @@ def test_multimodal_vl(vl_config): @patch.dict(os.environ, {"VLLM_WORKER_MULTIPROC_METHOD": "spawn"}) def test_multimodal_audio(): - audio_prompt = "".join([ - f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" - for idx in range(2) - ]) + audio_prompt = "".join([f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(2)]) question = "What sport and what nursery rhyme are referenced?" - prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "<|im_start|>user\n" - f"{audio_prompt}{question}<|im_end|>\n" - "<|im_start|>assistant\n") + prompt = ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) mm_data = { - "audio": [ - asset.audio_and_sample_rate for asset in - [AudioAsset("mary_had_lamb"), - AudioAsset("winning_call")] - ] + "audio": [asset.audio_and_sample_rate for asset in [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]] } inputs = {"prompt": prompt, "multi_modal_data": mm_data} - sampling_params = SamplingParams(temperature=0.2, - max_tokens=10, - stop_token_ids=None) + sampling_params = SamplingParams(temperature=0.2, max_tokens=10, stop_token_ids=None) - with VllmRunner("Qwen/Qwen2-Audio-7B-Instruct", - max_model_len=4096, - max_num_seqs=5, - dtype="bfloat16", - limit_mm_per_prompt={"audio": 2}, - cudagraph_capture_sizes=[1, 2, 4, 8], - gpu_memory_utilization=0.9) as runner: + with VllmRunner( + "Qwen/Qwen2-Audio-7B-Instruct", + max_model_len=4096, + max_num_seqs=5, + dtype="bfloat16", + limit_mm_per_prompt={"audio": 2}, + cudagraph_capture_sizes=[1, 2, 4, 8], + gpu_memory_utilization=0.9, + ) as runner: outputs = runner.generate(inputs, sampling_params=sampling_params) assert outputs is not None, "Generated outputs should not be None." diff --git a/tests/e2e/singlecard/test_xlite.py b/tests/e2e/singlecard/test_xlite.py index 8de3972b..231cb408 100644 --- a/tests/e2e/singlecard/test_xlite.py +++ b/tests/e2e/singlecard/test_xlite.py @@ -20,13 +20,14 @@ Compare the outputs of vLLM with and without xlite. Run `pytest tests/e2e/singlecard/test_xlite.py`. 
""" +# ruff: noqa: E501 + import os import pytest from vllm import SamplingParams -from tests.e2e.singlecard.utils import (PROMPTS_SHORT, LLMTestCase, - gen_and_valid) +from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, gen_and_valid os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2" @@ -35,9 +36,9 @@ CASE_DECODE_ONLY = LLMTestCase( prompts=PROMPTS_SHORT, golden_answers=[ "Hello, my name is Lina. I'm a 22-year-old student from China.", - 'The president of the United States is the same as the president of the United Nations. This is because the president', - 'The capital of France is Paris. The capital of France is also the capital of the French Republic.', - 'The future of AI is not just a technological challenge but a profound transformation of how we live, work' + "The president of the United States is the same as the president of the United Nations. This is because the president", + "The capital of France is Paris. The capital of France is also the capital of the French Republic.", + "The future of AI is not just a technological challenge but a profound transformation of how we live, work", ], sampling_params=SamplingParams( max_tokens=15, @@ -45,19 +46,22 @@ CASE_DECODE_ONLY = LLMTestCase( top_p=1.0, top_k=0, n=1, - )) + ), +) CASE_FULL = LLMTestCase( model="Qwen/Qwen3-0.6B", prompts=[ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ], golden_answers=[ " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the", - ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president', - ' Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital', - " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and" + " the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president", + " Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital", + " not just a technological challenge but a profound transformation of how we live, work, and interact with the world. 
As we stand at the intersection of artificial intelligence and", ], sampling_params=SamplingParams( max_tokens=32, @@ -65,27 +69,25 @@ CASE_FULL = LLMTestCase( top_p=1.0, top_k=0, n=1, - )) + ), +) -@pytest.mark.skip( - reason="TODO: Re-enable xlite_decode_only e2e test when stable.") +@pytest.mark.skip(reason="TODO: Re-enable xlite_decode_only e2e test when stable.") @pytest.mark.parametrize("cur_case", [CASE_DECODE_ONLY]) def test_models_with_xlite_decode_only(cur_case: LLMTestCase): runner_kwargs = { "model_name": cur_case.model, "max_model_len": 1024, "block_size": 128, - "additional_config": { - "xlite_graph_config": { - "enabled": True - } - }, + "additional_config": {"xlite_graph_config": {"enabled": True}}, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) @pytest.mark.parametrize("cur_case", [CASE_FULL]) @@ -94,14 +96,11 @@ def test_models_with_xlite_full_mode(cur_case: LLMTestCase): "model_name": cur_case.model, "max_model_len": 1024, "block_size": 128, - "additional_config": { - "xlite_graph_config": { - "enabled": True, - "full_mode": True - } - }, + "additional_config": {"xlite_graph_config": {"enabled": True, "full_mode": True}}, } - gen_and_valid(runner_kwargs=runner_kwargs, - prompts=cur_case.prompts, - sampling_params=cur_case.sampling_params, - golden_answers=cur_case.golden_answers) + gen_and_valid( + runner_kwargs=runner_kwargs, + prompts=cur_case.prompts, + sampling_params=cur_case.sampling_params, + golden_answers=cur_case.golden_answers, + ) diff --git a/tests/e2e/singlecard/utils.py b/tests/e2e/singlecard/utils.py index b9ada6c8..1ac30acb 100644 --- a/tests/e2e/singlecard/utils.py +++ b/tests/e2e/singlecard/utils.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Optional from vllm import SamplingParams @@ -7,37 +6,44 @@ from tests.e2e.conftest import VllmRunner from tests.e2e.model_utils import check_outputs_equal PROMPTS_SHORT = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ] # NOTE: Randomly fill the prompt with the requested amount for # the specified capture shape to prevent accuracy issues caused by padding PROMPTS_LONG = [ - ('Solve the following math problem step by step.' - 'The last line of your response should be of the form Answer: ' - '$Answer (without quotes) where $Answer is the answer to the problem.\n\n' - 'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$' - 'be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$,' - '$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$.' - 'If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$,' - 'where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.' - ), - ('Solve the following math problem step by step.' - 'The last line of your response should be of the form Answer: ' - '$Answer (without quotes) where $Answer is the answer to the problem.\n\n' - 'Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen' - 'independently and uniformly at random on the perimeter of $ABCD$.' 
- 'If the expected value of the area of triangle $\\triangle AXY$' - 'can be expressed as $\\frac{m}{n}$, for relatively prime positive' - 'integers $m$ and $n$, compute $m+n$.'), - ('Solve the following math problem step by step.' - 'The last line of your response should be of the form Answer: ' - '$Answer (without quotes) where $Answer is the answer to the problem.\n\n' - 'Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$' - 'and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$' - 'and $x^2 + cx + b = 0$ also have a common real root.' - 'Compute the sum $a + b + c$.') + ( + "Solve the following math problem step by step." + "The last line of your response should be of the form Answer: " + "$Answer (without quotes) where $Answer is the answer to the problem.\n\n" + "In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$" + "be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$," + "$\\angle BDC = 90^\\circ$. Suppose $AD = 1$ and $\\frac{BD}{CD} = \\frac{3}{2}$." + "If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$," + "where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$." + ), + ( + "Solve the following math problem step by step." + "The last line of your response should be of the form Answer: " + "$Answer (without quotes) where $Answer is the answer to the problem.\n\n" + "Let $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen" + "independently and uniformly at random on the perimeter of $ABCD$." + "If the expected value of the area of triangle $\\triangle AXY$" + "can be expressed as $\\frac{m}{n}$, for relatively prime positive" + "integers $m$ and $n$, compute $m+n$." + ), + ( + "Solve the following math problem step by step." + "The last line of your response should be of the form Answer: " + "$Answer (without quotes) where $Answer is the answer to the problem.\n\n" + "Let $a, b, c$ be distinct numbers such that the equations $x^2 + ax + 1 = 0$" + "and $x^2 + bx + c = 0$ have a common real root, and the equations $x^2 + x + a = 0$" + "and $x^2 + cx + b = 0$ also have a common real root." + "Compute the sum $a + b + c$." + ), ] @@ -46,7 +52,7 @@ class LLMTestCase: model: str prompts: list[str] golden_answers: list[str] - quantization: Optional[str] = None + quantization: str | None = None sampling_params: SamplingParams = field( default_factory=lambda: SamplingParams( max_tokens=32, @@ -54,14 +60,13 @@ class LLMTestCase: top_p=1.0, top_k=0, n=1, - )) + ) + ) -def gen_and_valid(runner_kwargs: dict, prompts: list[str], - sampling_params: SamplingParams, golden_answers: list[str]): +def gen_and_valid(runner_kwargs: dict, prompts: list[str], sampling_params: SamplingParams, golden_answers: list[str]): with VllmRunner(**runner_kwargs) as runner: - vllm_aclgraph_outputs = runner.model.generate( - prompts=prompts, sampling_params=sampling_params) + vllm_aclgraph_outputs = runner.model.generate(prompts=prompts, sampling_params=sampling_params) outputs_gen = [] for output in vllm_aclgraph_outputs: outputs_gen.append(([output.outputs[0].index], output.outputs[0].text))