### What this PR does / why we need it?
Mechanical cleanup of the e2e test suite: typing annotations are migrated from `Optional[X]`/`Union[X, Y]`/`Tuple[...]` to PEP 604 `X | None` unions and builtin generics, quoting is normalized, and code that was wrapped for a short line limit is reflowed to the new formatter's longer line length. No test logic is changed.

**Scope of Changes**:
| File Path |
| :--- |
| `tests/e2e/310p/multicard/test_vl_model_multicard.py` |
| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` |
| `tests/e2e/310p/test_utils.py` |
| `tests/e2e/conftest.py` |
| `tests/e2e/model_utils.py` |
| `tests/e2e/models/conftest.py` |
| `tests/e2e/models/test_lm_eval_correctness.py` |
| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` |
| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` |
| `tests/e2e/multicard/2-cards/test_data_parallel.py` |
| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` |
| `tests/e2e/multicard/2-cards/test_expert_parallel.py` |
| `tests/e2e/multicard/2-cards/test_external_launcher.py` |
| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` |
| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` |
| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` |
| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` |
| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` |
| `tests/e2e/multicard/2-cards/test_prefix_caching.py` |
| `tests/e2e/multicard/2-cards/test_quantization.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` |
| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` |
| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` |
| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` |
| `tests/e2e/multicard/2-cards/test_sp_pass.py` |
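The edits follow one repeated pattern; here is an illustrative sketch of it (the function and parameter names below are invented for the example, not taken from the diff):

```python
from typing import Optional, Tuple, Union


# Before: typing.Optional/Union/Tuple plus yapf-style wrapping.
def start_server(args: Union[list[str], str],
                 env: Optional[dict[str, str]] = None) -> Optional[Tuple[int, str]]:
    ...


# After: PEP 604 unions (Python 3.10+) and builtin generics,
# unwrapped to a single line under the longer line limit.
def start_server(args: list[str] | str, env: dict[str, str] | None = None) -> tuple[int, str] | None:
    ...
```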
### Does this PR introduce _any_ user-facing change?
No. Only test code under `tests/e2e/` is touched.
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 9562912cea
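For context, the diff below also reformats the `wait_until_npu_memory_free` decorator in `tests/e2e/conftest.py`. A minimal sketch of how such a decorator is applied (the import path, test name, and threshold values here are hypothetical):

```python
from tests.e2e.conftest import wait_until_npu_memory_free


# Hypothetical usage: the decorator spawns a subprocess that polls
# torch.npu.mem_get_info() until the requested fraction of NPU memory
# is free, and raises TimeoutError if the deadline passes first.
@wait_until_npu_memory_free(target_free_percentage=0.9, max_wait_seconds=300.0)
def test_offline_inference():
    ...
```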
Signed-off-by: MrZ20 <2609716663@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -32,7 +32,7 @@ import threading
 import time
 import traceback
 from pathlib import Path
-from typing import Any, Optional, Tuple, TypeVar, Union
+from typing import Any, TypeVar
 
 import numpy as np
 import openai
@@ -44,23 +44,20 @@ from modelscope import snapshot_download  # type: ignore[import-untyped]
 from PIL import Image
 from requests.exceptions import RequestException
 from torch import nn
-from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
-                          BatchEncoding, BatchFeature)
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BatchEncoding, BatchFeature
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm import LLM, SamplingParams
-from vllm.config.model import (ConvertOption, RunnerOption,
-                               _get_and_verify_dtype)
+from vllm.config.model import ConvertOption, RunnerOption, _get_and_verify_dtype
 from vllm.inputs import TextPrompt
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils.network_utils import get_open_port
 
-from tests.e2e.model_utils import (TokensTextLogprobs,
-                                   TokensTextLogprobsPromptLogprobs)
-from tests.e2e.nightly.multi_node.scripts.multi_node_config import (
-    DisaggregatedPrefillCfg, NodeInfo)
+from tests.e2e.model_utils import TokensTextLogprobs, TokensTextLogprobsPromptLogprobs
+from tests.e2e.nightly.multi_node.scripts.multi_node_config import DisaggregatedPrefillCfg, NodeInfo
 from vllm_ascend.ascend_config import clear_ascend_config
 
 # TODO: remove this part after the patch merged into vllm, if
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
@@ -70,41 +67,41 @@ adapt_patch(True)
 adapt_patch(False)
 
 from vllm.distributed.parallel_state import (  # noqa E402
-    destroy_distributed_environment, destroy_model_parallel)
+    destroy_distributed_environment,
+    destroy_model_parallel,
+)
 
 _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
 _M = TypeVar("_M")
 
-_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
+_PromptMultiModalInput = list[_M] | list[list[_M]]
 
 PromptImageInput = _PromptMultiModalInput[Image.Image]
-PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
+PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 logger = logging.getLogger(__name__)
 
 _TEST_DIR = os.path.dirname(__file__)
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "long_prompt.txt")]
 
-DISAGG_EPD_PROXY_SCRIPT = Path(
-    __file__
-).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
+DISAGG_EPD_PROXY_SCRIPT = (
+    Path(__file__).parent.parent.parent / "examples" / "disaggregated_encoder" / "disagg_epd_proxy.py"
+)
 
 
 def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: float):
     import torch_npu  # type: ignore
 
     # We can try to clean up memory in this subprocess, though it mostly affects this process.
     # But if there are any lingering contexts in this process (unlikely for a fresh spawn), it helps.
     gc.collect()
     torch.npu.empty_cache()
 
     _, total_npu_memory = torch.npu.mem_get_info()
     start_time = time.time()
 
     while True:
         free_bytes, _ = torch.npu.mem_get_info()
         if free_bytes / total_npu_memory >= target_free_percentage:
-            print(f'check_npu_memory_worker: npu free memory decreased target value.')
+            print("check_npu_memory_worker: npu free memory decreased target value.")
             return  # Success
 
         elapsed = time.time() - start_time
@@ -113,7 +110,7 @@ def _check_npu_memory_worker(target_free_percentage: float, max_wait_seconds: fl
         print(
             f"Timeout: NPU memory free size did not reach "
             f"{target_free_percentage} of total npu memory within {max_wait_seconds} seconds.",
-            file=sys.stderr
+            file=sys.stderr,
         )
         sys.exit(1)  # Failure
 
@@ -135,21 +132,19 @@ def wait_until_npu_memory_free(target_free_percentage: float = 0.5, max_wait_sec
         target_free_percentage (float): Target free memory percentage of total.
         max_wait_seconds (float): Maximum wait time in seconds.
     """
 
     def decorator(func):
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
             # Clean up non-NPU resources in the main process
             cleanup_dist_env_and_memory()
+
             # Use a spawned subprocess to check NPU memory to avoid initializing NPU in the main process
             ctx = multiprocessing.get_context("spawn")
-            p = ctx.Process(
-                target=_check_npu_memory_worker,
-                args=(target_free_percentage, max_wait_seconds)
-            )
+            p = ctx.Process(target=_check_npu_memory_worker, args=(target_free_percentage, max_wait_seconds))
             p.start()
             p.join()
 
             if p.exitcode != 0:
                 raise TimeoutError(
                     f"Timeout: NPU memory free size did not reach "
@@ -157,7 +152,9 @@ def wait_until_npu_memory_free(target_free_percentage: float = 0.5, max_wait_sec
                 )
 
             return func(*args, **kwargs)
+
         return wrapper
+
     return decorator
 
 
@@ -168,9 +165,10 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         torch.distributed.destroy_process_group()
     if shutdown_ray:
         import ray  # Lazy import Ray
+
         ray.shutdown()
     gc.collect()
 
     # Only clean NPU cache if NPU is already initialized/available in this process.
     # This prevents accidental initialization of NPU context in the main process,
     # which would break subsequent forks.
@@ -180,7 +178,6 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
 
 
 class MooncakeLauncher:
-
     def __init__(
         self,
         mooncake_port,
@@ -228,14 +225,12 @@ class MooncakeLauncher:
 class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
 
-    def _start_server(self, model: str, server_cmd: list[str],
-                      env_dict: Optional[dict[str, str]]) -> None:
-        """Subclasses override this method to customize server process launch
-        """
+    def _start_server(self, model: str, server_cmd: list[str], env_dict: dict[str, str] | None) -> None:
+        """Subclasses override this method to customize server process launch"""
         env = os.environ.copy()
         # the current process might initialize npu,
         # to be safe, we should use spawn method
-        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
         if env_dict is not None:
             env.update(env_dict)
         logger.info(f"Starting server with command: {' '.join(server_cmd)}")
@@ -247,47 +242,41 @@ class RemoteOpenAIServer:
         )
 
     def __init__(
-            self,
-            model: str,
-            vllm_serve_args: Union[list[str], str],
-            *,
-            server_host: str = '0.0.0.0',
-            server_port: int = 8080,
-            env_dict: Optional[dict[str, str]] = None,
-            seed: Optional[int] = None,
-            auto_port: bool = True,
-            nodes_info: Optional[list[NodeInfo]] = None,
-            disaggregated_prefill: Optional[DisaggregatedPrefillCfg] = None,
-            proxy_port: Optional[int] = None,
-            max_wait_seconds: Optional[float] = None,
-            override_hf_configs: Optional[dict[str, Any]] = None) -> None:
+        self,
+        model: str,
+        vllm_serve_args: list[str] | str,
+        *,
+        server_host: str = "0.0.0.0",
+        server_port: int = 8080,
+        env_dict: dict[str, str] | None = None,
+        seed: int | None = None,
+        auto_port: bool = True,
+        nodes_info: list[NodeInfo] | None = None,
+        disaggregated_prefill: DisaggregatedPrefillCfg | None = None,
+        proxy_port: int | None = None,
+        max_wait_seconds: float | None = None,
+        override_hf_configs: dict[str, Any] | None = None,
+    ) -> None:
         if isinstance(vllm_serve_args, str):
             vllm_serve_args = shlex.split(vllm_serve_args)
         else:
             vllm_serve_args = ["vllm", "serve", model, *vllm_serve_args]
         if auto_port:
             if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
-                raise ValueError("You have manually specified the port "
-                                 "when `auto_port=True`.")
+                raise ValueError("You have manually specified the port when `auto_port=True`.")
 
             # No need for a port if using unix sockets
             if "--uds" not in vllm_serve_args:
                 # Don't mutate the input args
-                vllm_serve_args = vllm_serve_args + [
-                    "--port", str(get_open_port())
-                ]
+                vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
         if seed is not None:
             if "--seed" in vllm_serve_args:
-                raise ValueError("You have manually specified the seed "
-                                 f"when `seed={seed}`.")
+                raise ValueError(f"You have manually specified the seed when `seed={seed}`.")
 
             vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]
 
         if override_hf_configs is not None:
-            vllm_serve_args = vllm_serve_args + [
-                "--hf-overrides",
-                json.dumps(override_hf_configs)
-            ]
+            vllm_serve_args = vllm_serve_args + ["--hf-overrides", json.dumps(override_hf_configs)]
 
         self.host = str(server_host)
         self.port = int(server_port)
@@ -303,9 +292,7 @@ class RemoteOpenAIServer:
             assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
             self._wait_for_server_pd(timeout=max_wait_seconds)
         else:
-            self._wait_for_multiple_servers(
-                [(self.host, self.url_for("health"))],
-                timeout=max_wait_seconds)
+            self._wait_for_multiple_servers([(self.host, self.url_for("health"))], timeout=max_wait_seconds)
 
     def __enter__(self):
         return self
@@ -313,7 +300,7 @@ class RemoteOpenAIServer:
     def __exit__(self, exc_type, exc_value, traceback):
         self._terminate_server()
 
-    def _poll(self) -> Optional[int]:
+    def _poll(self) -> int | None:
         """Subclasses override this method to customize process polling"""
         return self.proc.poll()
 
@@ -345,24 +332,23 @@ class RemoteOpenAIServer:
         def url_health(ip: str, port: int) -> str:
             return f"http://{ip}:{port}/health"
 
-        targets = [(node_info.ip, url_health(node_info.ip, self.port))
-                   for node_info in self.nodes_info if not node_info.headless]
+        targets = [
+            (node_info.ip, url_health(node_info.ip, self.port))
+            for node_info in self.nodes_info
+            if not node_info.headless
+        ]
 
         # Wait for proxy ready
         master_node = self.nodes_info[0]
         url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"
 
         # Wait for master node proxy first
-        self._wait_for_multiple_servers([(master_node.ip, url_proxy)],
-                                        timeout=timeout)
+        self._wait_for_multiple_servers([(master_node.ip, url_proxy)], timeout=timeout)
 
         # Then wait for all api_server nodes
         self._wait_for_multiple_servers(targets=targets, timeout=timeout)
 
-    def _wait_for_multiple_servers(self,
-                                   targets,
-                                   timeout: float,
-                                   log_interval: float = 30.0):
+    def _wait_for_multiple_servers(self, targets, timeout: float, log_interval: float = 30.0):
         """
         targets: List[(node_ip, url)]
         log_interval
@@ -396,9 +382,7 @@ class RemoteOpenAIServer:
                 # check unexpected exit
                 result = self._poll()
                 if result is not None and result != 0:
-                    raise RuntimeError(
-                        f"Server at {node_ip} exited unexpectedly."
-                    ) from None
+                    raise RuntimeError(f"Server at {node_ip} exited unexpectedly.") from None
 
                 if should_log:
                     last_log_time = now
@@ -444,35 +428,31 @@ class RemoteOpenAIServer:
     def get_async_client(self, **kwargs):
         if "timeout" not in kwargs:
             kwargs["timeout"] = 600
-        return openai.AsyncOpenAI(base_url=self.url_for("v1"),
-                                  api_key=self.DUMMY_API_KEY,
-                                  max_retries=0,
-                                  **kwargs)
+        return openai.AsyncOpenAI(base_url=self.url_for("v1"), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs)
 
 
 class RemoteEPDServer(RemoteOpenAIServer):
-    def _start_server(self, model: str, server_cmd: list[str],
-                      env_dict: Optional[dict[str, str]]) -> None:
-        """Subclasses override this method to customize server process launch
-        """
+    def _start_server(self, model: str, server_cmd: list[str], env_dict: dict[str, str] | None) -> None:
+        """Subclasses override this method to customize server process launch"""
        raise NotImplementedError("RemoteEPDServer should use _start_server_with_prefix instead")
 
-    def __init__(self,
-                 vllm_serve_args: Union[list[str], list[list[str]]],
-                 server_host: str = '0.0.0.0',
-                 env_dict: Optional[dict[str, str]] = None,
-                 max_wait_seconds: Optional[float] = 2800) -> None:
-
+    def __init__(
+        self,
+        vllm_serve_args: list[str] | list[list[str]],
+        server_host: str = "0.0.0.0",
+        env_dict: dict[str, str] | None = None,
+        max_wait_seconds: float | None = 2800,
+    ) -> None:
         self._proc_list = []
 
         self.env_dict: dict[str, str] = {}
         if env_dict is not None:
             self.env_dict.update(env_dict)
 
-        self.env_dict['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = "1"
-        self.env_dict['VLLM_USE_V1'] = "1"
-        self.env_dict['PYTORCH_NPU_ALLOC_CONF'] = "expandable_segments:True"
-        self.env_dict['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        self.env_dict["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+        self.env_dict["VLLM_USE_V1"] = "1"
+        self.env_dict["PYTORCH_NPU_ALLOC_CONF"] = "expandable_segments:True"
+        self.env_dict["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
         self.vllm_serve_args_list = []
         self.health_url_list = []
@@ -484,8 +464,7 @@ class RemoteEPDServer(RemoteOpenAIServer):
                 self.vllm_serve_args_list.append([str(arg) for arg in args_copy])
             else:
                 self.vllm_serve_args_list = [
-                    [str(arg) for arg in sublist]
-                    for sublist in copy.deepcopy(vllm_serve_args)
+                    [str(arg) for arg in sublist] for sublist in copy.deepcopy(vllm_serve_args)
                 ]
         else:
             raise RuntimeError("vllm_serves_args must be a list")
@@ -493,7 +472,7 @@ class RemoteEPDServer(RemoteOpenAIServer):
         serve_arg_cmd = ["vllm", "serve"]
 
         for i, vllm_serve_arg in enumerate(self.vllm_serve_args_list):
-            self.env_dict['ASCEND_RT_VISIBLE_DEVICES'] = str(i)
+            self.env_dict["ASCEND_RT_VISIBLE_DEVICES"] = str(i)
             if isinstance(vllm_serve_arg, list):
                 if "--port" not in vllm_serve_arg:
                     raise ValueError("You have manually specified the port ")
@@ -514,16 +493,13 @@ class RemoteEPDServer(RemoteOpenAIServer):
 
             self.health_url_list.append(super().url_for("health"))
             vllm_serve_arg = [*serve_arg_cmd, *vllm_serve_arg]
-            proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict,
-                                                  f"[VLLM_{i}] ")
+            proc = self._start_server_with_prefix(vllm_serve_arg, self.env_dict, f"[VLLM_{i}] ")
             self._proc_list.append(proc)
 
         timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
-        super()._wait_for_multiple_servers([(self.host, url)
-                                            for url in self.health_url_list],
-                                           timeout=timeout_value)
+        super()._wait_for_multiple_servers([(self.host, url) for url in self.health_url_list], timeout=timeout_value)
 
-    def _poll(self) -> Optional[int]:
+    def _poll(self) -> int | None:
         return None
 
     def _delete_shm(self) -> None:
@@ -542,31 +518,23 @@ class RemoteEPDServer(RemoteOpenAIServer):
     def _read_output(self, pipe, prefix):
         try:
             with pipe:
-                for line in iter(pipe.readline, ''):
+                for line in iter(pipe.readline, ""):
                     if line:
-                        print(f"{prefix}: {line}", end='')
+                        print(f"{prefix}: {line}", end="")
 
         except Exception as e:
             print(f"error: {e}")
             traceback.print_exc()
 
-    def _start_server_with_prefix(self, server_cmd: list[str],
-                                  env_dict: Optional[dict[str, str]], log_prefix: str):
+    def _start_server_with_prefix(self, server_cmd: list[str], env_dict: dict[str, str] | None, log_prefix: str):
         env = os.environ.copy()
         if env_dict is not None:
             env.update(env_dict)
-        proc = subprocess.Popen(server_cmd,
-                                env=env,
-                                stdout=subprocess.PIPE,
-                                stderr=subprocess.PIPE,
-                                universal_newlines=True,
-                                bufsize=1)
-        stdout_thread = threading.Thread(target=self._read_output,
-                                         args=(proc.stdout, log_prefix),
-                                         daemon=True)
-        stderr_thread = threading.Thread(target=self._read_output,
-                                         args=(proc.stderr, log_prefix),
-                                         daemon=True)
+        proc = subprocess.Popen(
+            server_cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, bufsize=1
+        )
+        stdout_thread = threading.Thread(target=self._read_output, args=(proc.stdout, log_prefix), daemon=True)
+        stderr_thread = threading.Thread(target=self._read_output, args=(proc.stderr, log_prefix), daemon=True)
 
         stdout_thread.start()
         stderr_thread.start()
@@ -579,27 +547,21 @@ class RemoteEPDServer(RemoteOpenAIServer):
         parent = psutil.Process(proc.pid)
         children = parent.children(recursive=True)
         for child in children:
-            try:
+            with contextlib.suppress(psutil.NoSuchProcess):
                 child.terminate()
-            except psutil.NoSuchProcess:
-                pass
 
         gone, still_alive = psutil.wait_procs(children, timeout=10)
 
         for child in still_alive:
-            try:
+            with contextlib.suppress(psutil.NoSuchProcess):
                 child.kill()
-            except psutil.NoSuchProcess:
-                pass
 
         try:
             parent.terminate()
             parent.wait(timeout=10)
         except (psutil.NoSuchProcess, psutil.TimeoutExpired):
-            try:
+            with contextlib.suppress(psutil.NoSuchProcess):
                 parent.kill()
-            except psutil.NoSuchProcess:
-                pass
 
     def __enter__(self):
         """Context manager entry point."""
@@ -611,13 +573,13 @@ class RemoteEPDServer(RemoteOpenAIServer):
 
 
 class DisaggEpdProxy(RemoteEPDServer):
-
-    def __init__(self,
-                 proxy_args: Optional[Union[list[str], str]] = None,
-                 env_dict: Optional[dict[str, str]] = None,
-                 server_host: str = '0.0.0.0',
-                 max_wait_seconds: Optional[float] = 2800) -> None:
-
+    def __init__(
+        self,
+        proxy_args: list[str] | str | None = None,
+        env_dict: dict[str, str] | None = None,
+        server_host: str = "0.0.0.0",
+        max_wait_seconds: float | None = 2800,
+    ) -> None:
         if proxy_args is None:
             proxy_args_list: list[str] = []
         elif isinstance(proxy_args, str):
@@ -648,8 +610,7 @@ class DisaggEpdProxy(RemoteEPDServer):
         self.port = int(port_str)
 
         timeout_value = float(max_wait_seconds) if max_wait_seconds is not None else 2800.0
-        super()._wait_for_multiple_servers(
-            [(self.host, super().url_for("health"))], timeout=timeout_value)
+        super()._wait_for_multiple_servers([(self.host, super().url_for("health"))], timeout=timeout_value)
 
     def __enter__(self):
         """Context manager entry point."""
@@ -661,23 +622,22 @@ class DisaggEpdProxy(RemoteEPDServer):
 
 
 class VllmRunner:
-
     def __init__(
         self,
         model_name: str,
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
-        tokenizer_name: Optional[str] = None,
+        tokenizer_name: str | None = None,
         tokenizer_mode: str = "auto",
-        max_model_len: Optional[int] = 1024,
+        max_model_len: int | None = 1024,
         dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
         enable_chunked_prefill: bool = True,
         swap_space: int = 4,
-        enforce_eager: Optional[bool] = False,
-        quantization: Optional[str] = None,
+        enforce_eager: bool | None = False,
+        quantization: str | None = None,
         **kwargs,
     ) -> None:
         self.model = LLM(
@@ -701,17 +661,13 @@ class VllmRunner:
 
     def get_inputs(
         self,
-        prompts: Union[list[str], list[torch.Tensor], list[int]],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        prompts: list[str] | list[torch.Tensor] | list[int],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
     ) -> list[TextPrompt]:
-
-        if any(x is not None and len(x) != len(prompts)
-               for x in [images, videos, audios]):
-            raise ValueError(
-                "All non-None multimodal inputs must have the same length as "
-                "prompts")
+        if any(x is not None and len(x) != len(prompts) for x in [images, videos, audios]):
+            raise ValueError("All non-None multimodal inputs must have the same length as prompts")
 
         inputs = []
         for i, prompt in enumerate(prompts):
@@ -719,13 +675,11 @@ class VllmRunner:
             if images is not None and (image := images[i]) is not None:
                 multi_modal_data["image"] = image
             if videos is not None and (video := videos[i]) is not None:
-                multi_modal_data["video"] = video # type: ignore
+                multi_modal_data["video"] = video  # type: ignore
             if audios is not None and (audio := audios[i]) is not None:
-                multi_modal_data["audio"] = audio # type: ignore
+                multi_modal_data["audio"] = audio  # type: ignore
 
-            text_prompt_kwargs: dict[str, Any] = {
-                "multi_modal_data": multi_modal_data or None
-            }
+            text_prompt_kwargs: dict[str, Any] = {"multi_modal_data": multi_modal_data or None}
             if isinstance(prompt, str):
                 text_prompt_kwargs["prompt"] = prompt
             elif isinstance(prompt, list):
@@ -739,21 +693,16 @@ class VllmRunner:
 
     def generate(
         self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: list[str] | list[torch.Tensor],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[list[int]], list[str]]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
-        req_outputs = self.model.generate(inputs,
-                                          sampling_params=sampling_params,
-                                          **kwargs)
+        req_outputs = self.model.generate(inputs, sampling_params=sampling_params, **kwargs)
 
         outputs: list[tuple[list[list[int]], list[str]]] = []
         for req_output in req_outputs:
@@ -780,99 +729,83 @@ class VllmRunner:
                 output_str = sample.text
                 output_ids = list(sample.token_ids)
                 output_logprobs = sample.logprobs
-            outputs.append((output_ids, output_str, output_logprobs,
-                            req_output.prompt_logprobs))
+            outputs.append((output_ids, output_str, output_logprobs, req_output.prompt_logprobs))
         return outputs
 
     def generate_w_logprobs(
         self,
         prompts: list[str],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs],
-               list[TokensTextLogprobsPromptLogprobs]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
-        req_outputs = self.model.generate(inputs,
-                                          sampling_params=sampling_params,
-                                          **kwargs)
+        req_outputs = self.model.generate(inputs, sampling_params=sampling_params, **kwargs)
 
-        toks_str_logsprobs_prompt_logprobs = (
-            self._final_steps_generate_w_logprobs(req_outputs))
+        toks_str_logsprobs_prompt_logprobs = self._final_steps_generate_w_logprobs(req_outputs)
         # Omit prompt logprobs if not required by sampling params
-        return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
-                if sampling_params.prompt_logprobs is None else
-                toks_str_logsprobs_prompt_logprobs)
+        return (
+            [x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
+            if sampling_params.prompt_logprobs is None
+            else toks_str_logsprobs_prompt_logprobs
+        )
 
     def generate_greedy(
         self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: list[str] | list[torch.Tensor],
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
        **kwargs: Any,
     ) -> list[tuple[list[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        outputs = self.generate(prompts,
-                                greedy_params,
-                                images=images,
-                                videos=videos,
-                                audios=audios,
-                                **kwargs)
-        return [(output_ids[0], output_str[0])
-                for output_ids, output_str in outputs]
+        outputs = self.generate(prompts, greedy_params, images=images, videos=videos, audios=audios, **kwargs)
+        return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs]
 
     def generate_greedy_logprobs(
         self,
         prompts: list[str],
         max_tokens: int,
-        num_logprobs: Optional[int],
-        num_prompt_logprobs: Optional[int] = None,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        stop_token_ids: Optional[list[int]] = None,
-        stop: Optional[list[str]] = None,
+        num_logprobs: int | None,
+        num_prompt_logprobs: int | None = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
+        stop_token_ids: list[int] | None = None,
+        stop: list[str] | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs],
-               list[TokensTextLogprobsPromptLogprobs]]:
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
         greedy_logprobs_params = SamplingParams(
             temperature=0.0,
             max_tokens=max_tokens,
             logprobs=num_logprobs,
             prompt_logprobs=num_prompt_logprobs,
             stop_token_ids=stop_token_ids,
-            stop=stop)
+            stop=stop,
+        )
 
-        return self.generate_w_logprobs(prompts,
-                                        greedy_logprobs_params,
-                                        images=images,
-                                        audios=audios,
-                                        videos=videos,
-                                        **kwargs)
+        return self.generate_w_logprobs(
+            prompts, greedy_logprobs_params, images=images, audios=audios, videos=videos, **kwargs
+        )
 
     def classify(self, prompts: list[str]) -> list[list[float]]:
         req_outputs = self.model.classify(prompts)
         return [req_output.outputs.probs for req_output in req_outputs]
 
-    def embed(self,
-              prompts: list[str],
-              images: Optional[PromptImageInput] = None,
-              videos: Optional[PromptVideoInput] = None,
-              audios: Optional[PromptAudioInput] = None,
-              *args,
-              **kwargs) -> list[list[float]]:
-        inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
+    def embed(
+        self,
+        prompts: list[str],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+        *args,
+        **kwargs,
+    ) -> list[list[float]]:
+        inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
         req_outputs = self.model.embed(inputs, *args, **kwargs)
         return [req_output.outputs.embedding for req_output in req_outputs]
@@ -887,8 +820,8 @@ class VllmRunner:
 
     def score(
         self,
-        text_1: Union[str, list[str]],
-        text_2: Union[str, list[str]],
+        text_1: str | list[str],
+        text_2: str | list[str],
         *args,
         **kwargs,
     ) -> list[float]:
@@ -905,14 +838,11 @@ class VllmRunner:
 
 
 class HfRunner:
-
     def get_default_device(self):
-        return ("cpu"
-                if current_platform.is_cpu() else current_platform.device_type)
+        return "cpu" if current_platform.is_cpu() else current_platform.device_type
 
-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
-        if x is None or isinstance(x, (bool, )):
+    def wrap_device(self, x: _T, device: str | None = None) -> _T:
+        if x is None or isinstance(x, (bool,)):
             return x
 
         if device is None:
@@ -931,7 +861,7 @@ class HfRunner:
         model_name: str,
         dtype: str = "auto",
         *,
-        model_kwargs: Optional[dict[str, Any]] = None,
+        model_kwargs: dict[str, Any] | None = None,
         trust_remote_code: bool = True,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
@@ -984,14 +914,15 @@ class HfRunner:
         )
 
         # in case some unquantized custom models are not in same dtype
-        if (getattr(model, "quantization_method", None) is None
-                and any(p.dtype != self.dtype
-                        for p in model.parameters())):
+        if getattr(model, "quantization_method", None) is None and any(
+            p.dtype != self.dtype for p in model.parameters()
+        ):
             model = model.to(dtype=self.dtype)
 
-        if (getattr(model, "quantization_method", None) != "bitsandbytes"
-                and len({p.device
-                         for p in model.parameters()}) < 2):
+        if (
+            getattr(model, "quantization_method", None) != "bitsandbytes"
+            and len({p.device for p in model.parameters()}) < 2
+        ):
             model = model.to(device=self.device)
 
         self.model = model
@@ -1006,6 +937,7 @@ class HfRunner:
             # don't put this import at the top level
            # it will call torch.cuda.device_count()
             from transformers import AutoProcessor  # noqa: F401
+
             self.processor = AutoProcessor.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
@@ -1017,10 +949,10 @@ class HfRunner:
     def get_inputs(
         self,
         prompts: list[str],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-    ) -> list[Union[BatchFeature, BatchEncoding]]:
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+    ) -> list[BatchFeature | BatchEncoding]:
         if images is not None:
             assert len(prompts) == len(images)
 
@@ -1030,7 +962,7 @@ class HfRunner:
         if audios is not None:
             assert len(prompts) == len(audios)
 
-        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
+        all_inputs: list[BatchFeature | BatchEncoding] = []
         for i, prompt in enumerate(prompts):
             processor_kwargs: dict[str, Any] = {
                 "text": prompt,
@@ -1076,16 +1008,11 @@ class HfRunner:
 
         return outputs
 
-    def encode(self, prompts: list[str], *args,
-               **kwargs) -> list[list[torch.Tensor]]:
+    def encode(self, prompts: list[str], *args, **kwargs) -> list[list[torch.Tensor]]:
         return self.model.encode(prompts, *args, **kwargs)
 
-    def predict(self, prompts: list[list[str]], *args,
-                **kwargs) -> torch.Tensor:
-        return self.model.predict(prompts,
-                                  *args,
-                                  convert_to_tensor=True,
-                                  **kwargs)
+    def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
+        return self.model.predict(prompts, *args, convert_to_tensor=True, **kwargs)
 
     def __enter__(self):
         return self
@@ -1103,22 +1030,25 @@ def ilama_lora_files():
 @pytest.fixture(scope="session")
 def llama32_lora_files():
     from huggingface_hub import snapshot_download as hf_snapshot_download
 
     return hf_snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider", local_files_only=True)
 
 
 def qwen_prompt(questions: list[str]) -> list[str]:
     placeholder = "<|image_pad|>"
-    return [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-             f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
-             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
+    return [
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{q}<|im_end|>\n<|im_start|>assistant\n"
+        )
+        for q in questions
+    ]
 
 
 def hunyuan_prompt(questions: list[str]) -> list[str]:
     placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
-    return [
-        f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
-        for question in questions
-    ]
+    return [f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>" for question in questions]
 
 
 PROMPT_CONFIGS = {