Iluvatar-mrv100 SDK 4.3.0

2025-09-15 14:58:11 +08:00
parent 9efe891f99
commit 8af8290b1d
1052 changed files with 294967 additions and 1 deletions
--- a/vllm/platforms/init.py
+++ b/vllm/platforms/init.py
@@ -0,0 +1,287 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import traceback
+from itertools import chain
+from typing import TYPE_CHECKING, Optional
+
+from vllm.plugins import load_plugins_by_group
+from vllm.utils import resolve_obj_by_qualname
+
+from .interface import _Backend  # noqa: F401
+from .interface import CpuArchEnum, Platform, PlatformEnum
+
+logger = logging.getLogger(__name__)
+
+
+def vllm_version_matches_substr(substr: str) -> bool:
+    """
+    Check to see if the vLLM version matches a substring.
+    """
+    from importlib.metadata import PackageNotFoundError, version
+    try:
+        vllm_version = version("vllm")
+    except PackageNotFoundError as e:
+        logger.warning(
+            "The vLLM package was not found, so its version could not be "
+            "inspected. This may cause platform detection to fail.")
+        raise e
+    return substr in vllm_version
+
+
+def tpu_platform_plugin() -> Optional[str]:
+    is_tpu = False
+    logger.debug("Checking if TPU platform is available.")
+    try:
+        # While it's technically possible to install libtpu on a
+        # non-TPU machine, this is a very uncommon scenario. Therefore,
+        # we assume that libtpu is installed if and only if the machine
+        # has TPUs.
+        import libtpu  # noqa: F401
+        is_tpu = True
+        logger.debug("Confirmed TPU platform is available.")
+    except Exception as e:
+        logger.debug("TPU platform is not available because: %s", str(e))
+        pass
+
+    return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None
+
+
+def cuda_platform_plugin() -> Optional[str]:
+    return "vllm.platforms.cuda.CudaPlatform"
+    is_cuda = False
+    logger.debug("Checking if CUDA platform is available.")
+    try:
+        from vllm.utils import import_pynvml
+        pynvml = import_pynvml()
+        pynvml.nvmlInit()
+        try:
+            # NOTE: Edge case: vllm cpu build on a GPU machine.
+            # Third-party pynvml can be imported in cpu build,
+            # we need to check if vllm is built with cpu too.
+            # Otherwise, vllm will always activate cuda plugin
+            # on a GPU machine, even if in a cpu build.
+            is_cuda = (pynvml.nvmlDeviceGetCount() > 0
+                       and not vllm_version_matches_substr("cpu"))
+            if pynvml.nvmlDeviceGetCount() <= 0:
+                logger.debug(
+                    "CUDA platform is not available because no GPU is found.")
+            if vllm_version_matches_substr("cpu"):
+                logger.debug("CUDA platform is not available because"
+                             " vLLM is built with CPU.")
+            if is_cuda:
+                logger.debug("Confirmed CUDA platform is available.")
+        finally:
+            pynvml.nvmlShutdown()
+    except Exception as e:
+        logger.debug("Exception happens when checking CUDA platform: %s",
+                     str(e))
+        if "nvml" not in e.__class__.__name__.lower():
+            # If the error is not related to NVML, re-raise it.
+            raise e
+
+        # CUDA is supported on Jetson, but NVML may not be.
+        import os
+
+        def cuda_is_jetson() -> bool:
+            return os.path.isfile("/etc/nv_tegra_release") \
+                or os.path.exists("/sys/class/tegra-firmware")
+
+        if cuda_is_jetson():
+            logger.debug("Confirmed CUDA platform is available on Jetson.")
+            is_cuda = True
+        else:
+            logger.debug("CUDA platform is not available because: %s", str(e))
+
+    return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None
+
+
+def rocm_platform_plugin() -> Optional[str]:
+    is_rocm = False
+    logger.debug("Checking if ROCm platform is available.")
+    try:
+        import amdsmi
+        amdsmi.amdsmi_init()
+        try:
+            if len(amdsmi.amdsmi_get_processor_handles()) > 0:
+                is_rocm = True
+                logger.debug("Confirmed ROCm platform is available.")
+            else:
+                logger.debug("ROCm platform is not available because"
+                             " no GPU is found.")
+        finally:
+            amdsmi.amdsmi_shut_down()
+    except Exception as e:
+        logger.debug("ROCm platform is not available because: %s", str(e))
+        pass
+
+    return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
+
+
+def hpu_platform_plugin() -> Optional[str]:
+    is_hpu = False
+    logger.debug("Checking if HPU platform is available.")
+    try:
+        from importlib import util
+        is_hpu = util.find_spec('habana_frameworks') is not None
+        if is_hpu:
+            logger.debug("Confirmed HPU platform is available.")
+        else:
+            logger.debug("HPU platform is not available because "
+                         "habana_frameworks is not found.")
+    except Exception as e:
+        logger.debug("HPU platform is not available because: %s", str(e))
+        pass
+
+    return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None
+
+
+def xpu_platform_plugin() -> Optional[str]:
+    is_xpu = False
+    logger.debug("Checking if XPU platform is available.")
+    try:
+        # installed IPEX if the machine has XPUs.
+        import intel_extension_for_pytorch  # noqa: F401
+        import oneccl_bindings_for_pytorch  # noqa: F401
+        import torch
+        if hasattr(torch, 'xpu') and torch.xpu.is_available():
+            is_xpu = True
+            logger.debug("Confirmed XPU platform is available.")
+    except Exception as e:
+        logger.debug("XPU platform is not available because: %s", str(e))
+        pass
+
+    return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
+
+
+def cpu_platform_plugin() -> Optional[str]:
+    is_cpu = False
+    logger.debug("Checking if CPU platform is available.")
+    try:
+        is_cpu = vllm_version_matches_substr("cpu")
+        if is_cpu:
+            logger.debug("Confirmed CPU platform is available because"
+                         " vLLM is built with CPU.")
+        if not is_cpu:
+            import sys
+            is_cpu = sys.platform.startswith("darwin")
+            if is_cpu:
+                logger.debug("Confirmed CPU platform is available"
+                             " because the machine is MacOS.")
+
+    except Exception as e:
+        logger.debug("CPU platform is not available because: %s", str(e))
+        pass
+
+    return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None
+
+
+def neuron_platform_plugin() -> Optional[str]:
+    is_neuron = False
+    logger.debug("Checking if Neuron platform is available.")
+    try:
+        import transformers_neuronx  # noqa: F401
+        is_neuron = True
+        logger.debug("Confirmed Neuron platform is available because"
+                     " transformers_neuronx is found.")
+    except ImportError as e:
+        logger.debug("Neuron platform is not available because: %s", str(e))
+        pass
+
+    return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None
+
+
+builtin_platform_plugins = {
+    'tpu': tpu_platform_plugin,
+    'cuda': cuda_platform_plugin,
+    'rocm': rocm_platform_plugin,
+    'hpu': hpu_platform_plugin,
+    'xpu': xpu_platform_plugin,
+    'cpu': cpu_platform_plugin,
+    'neuron': neuron_platform_plugin,
+}
+
+
+def resolve_current_platform_cls_qualname() -> str:
+    platform_plugins = load_plugins_by_group('vllm.platform_plugins')
+
+    activated_plugins = []
+
+    for name, func in chain(builtin_platform_plugins.items(),
+                            platform_plugins.items()):
+        try:
+            assert callable(func)
+            platform_cls_qualname = func()
+            if platform_cls_qualname is not None:
+                activated_plugins.append(name)
+        except Exception:
+            pass
+
+    activated_builtin_plugins = list(
+        set(activated_plugins) & set(builtin_platform_plugins.keys()))
+    activated_oot_plugins = list(
+        set(activated_plugins) & set(platform_plugins.keys()))
+
+    if len(activated_oot_plugins) >= 2:
+        raise RuntimeError(
+            "Only one platform plugin can be activated, but got: "
+            f"{activated_oot_plugins}")
+    elif len(activated_oot_plugins) == 1:
+        platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]()
+        logger.info("Platform plugin %s is activated",
+                    activated_oot_plugins[0])
+    elif len(activated_builtin_plugins) >= 2:
+        raise RuntimeError(
+            "Only one platform plugin can be activated, but got: "
+            f"{activated_builtin_plugins}")
+    elif len(activated_builtin_plugins) == 1:
+        platform_cls_qualname = builtin_platform_plugins[
+            activated_builtin_plugins[0]]()
+        logger.info("Automatically detected platform %s.",
+                    activated_builtin_plugins[0])
+    else:
+        platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform"
+        logger.info(
+            "No platform detected, vLLM is running on UnspecifiedPlatform")
+    return platform_cls_qualname
+
+
+_current_platform = None
+_init_trace: str = ''
+
+if TYPE_CHECKING:
+    current_platform: Platform
+
+
+def __getattr__(name: str):
+    if name == 'current_platform':
+        # lazy init current_platform.
+        # 1. out-of-tree platform plugins need `from vllm.platforms import
+        #    Platform` so that they can inherit `Platform` class. Therefore,
+        #    we cannot resolve `current_platform` during the import of
+        #    `vllm.platforms`.
+        # 2. when users use out-of-tree platform plugins, they might run
+        #    `import vllm`, some vllm internal code might access
+        #    `current_platform` during the import, and we need to make sure
+        #    `current_platform` is only resolved after the plugins are loaded
+        #    (we have tests for this, if any developer violate this, they will
+        #    see the test failures).
+        global _current_platform
+        if _current_platform is None:
+            platform_cls_qualname = resolve_current_platform_cls_qualname()
+            _current_platform = resolve_obj_by_qualname(
+                platform_cls_qualname)()
+            global _init_trace
+            _init_trace = "".join(traceback.format_stack())
+        return _current_platform
+    elif name in globals():
+        return globals()[name]
+    else:
+        raise AttributeError(
+            f"No attribute named '{name}' exists in {__name__}.")
+
+
+__all__ = [
+    'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum',
+    "_init_trace"
+]
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import sys
+from importlib.util import find_spec
+from typing import TYPE_CHECKING, Optional
+
+import psutil
+import torch
+
+from vllm.logger import init_logger
+
+from .interface import Platform, PlatformEnum, _Backend
+
+logger = init_logger(__name__)
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
+
+class CpuPlatform(Platform):
+    _enum = PlatformEnum.CPU
+    device_name: str = "cpu"
+    device_type: str = "cpu"
+    dispatch_key: str = "CPU"
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return "cpu"
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
+        if selected_backend and selected_backend != _Backend.TORCH_SDPA:
+            logger.info("Cannot use %s backend on CPU.", selected_backend)
+        if use_mla:
+            logger.info("Using CPU MLA backend.")
+            return "vllm.attention.backends.cpu_mla.CPUMLABackend"
+        logger.info("Using Torch SDPA backend.")
+        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        return psutil.virtual_memory().total
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return False
+
+    @classmethod
+    def inference_mode(cls):
+        return torch.no_grad()
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        import vllm.envs as envs
+        from vllm.utils import GiB_bytes
+        model_config = vllm_config.model_config
+        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # If the feature combo become valid
+        if not model_config.enforce_eager:
+            model_config.enforce_eager = True
+
+        cache_config = vllm_config.cache_config
+
+        ipex_avaliable = find_spec("intel_extension_for_pytorch") is not None
+
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 128 if ipex_avaliable else 16
+
+        if not ipex_avaliable and cache_config.block_size != 16:
+            raise RuntimeError(
+                f"--block-size={cache_config.block_size} requires"
+                " intel_extension_for_pytorch")
+
+        scheduler_config = vllm_config.scheduler_config
+        if ((scheduler_config.chunked_prefill_enabled
+             or cache_config.enable_prefix_caching)
+                and cache_config.cache_dtype != "auto"):
+            raise RuntimeError("Chunked-prefill and prefix-cache on the CPU "
+                               "backend is not compatible with FP8 KV cache.")
+
+        if cache_config.cache_dtype == "fp8_e4m3":
+            cache_config.cache_dtype = "fp8_e5m2"
+            logger.warning(
+                "CPU backend doesn't support fp8_e4m3 KV cache type, "
+                "cast to fp8_e5m2.")
+
+        if (cache_config.cache_dtype != "auto"
+                and model_config.dtype == torch.half):
+            logger.warning("FP8 KV cache on the CPU backend only does not"
+                           " support fp16 for now, cast to bf16.")
+            model_config.dtype = torch.bfloat16
+
+        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
+
+        if kv_cache_space >= 0:
+            if kv_cache_space == 0:
+                cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
+                logger.warning(
+                    "Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) "
+                    "for CPU backend is not set, using 4 by default.")
+            else:
+                cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore # noqa
+        else:
+            raise RuntimeError(
+                "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
+                f" {kv_cache_space}, expect a positive integer value.")
+
+        parallel_config = vllm_config.parallel_config
+        if (parallel_config.distributed_executor_backend is not None
+                and parallel_config.distributed_executor_backend != "mp"):
+            logger.warning(("%s is not supported on CPU, fallback to mp "
+                            "distributed executor backend."),
+                           parallel_config.distributed_executor_backend)
+            parallel_config.distributed_executor_backend = "mp"
+        if parallel_config.worker_cls == "auto":
+            if vllm_config.speculative_config:
+                parallel_config.worker_cls = \
+                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+                parallel_config.sd_worker_cls = \
+                    "vllm.worker.cpu_worker.CPUWorker"
+            else:
+                parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+
+        assert vllm_config.device_config.device_type == "cpu"
+
+        #
+        # Environment variables for CPU executor
+        #
+
+        # Set default threads num for OpenMP parallel
+        os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
+
+        # Disable torch async compiling which won't work with daemonic processes
+        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
+
+        # Intel OpenMP setting
+        ld_prealod_str = os.getenv("LD_PRELOAD", "")
+        if "libiomp5.so" in ld_prealod_str:
+            # The time(milliseconds) that a thread should wait after
+            # completing the execution of a parallel region, before sleeping.
+            os.environ['KMP_BLOCKTIME'] = "1"
+            # Prevents the CPU to run into low performance state
+            os.environ['KMP_TPAUSE'] = "0"
+            # Provides fine granularity parallelism
+            os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
+            os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
+            os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
+
+        # To hint IPEX uses shared memory based AllReduce
+        os.environ["LOCAL_WORLD_SIZE"] = str(
+            vllm_config.parallel_config.tensor_parallel_size)
+        if sys.platform == "darwin" and \
+                envs.VLLM_WORKER_MULTIPROC_METHOD == "fork":
+            if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None:
+                logger.warning(
+                    "Default to spawn method on MacOS. If this is not desired,"
+                    " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.")
+                os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on CPU.")
+        return False
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"
+
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        """
+        Get device specific communicator class for distributed communication.
+        """
+        return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator"  # noqa
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -0,0 +1,462 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Code inside this file can safely assume cuda platform, e.g. importing
+pynvml. However, it should not initialize cuda context.
+"""
+
+import os
+from functools import wraps
+from typing import (TYPE_CHECKING, Callable, List, Optional, Tuple, TypeVar,
+                    Union)
+
+import torch
+from typing_extensions import ParamSpec
+
+# import custom ops, trigger op registration
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.utils import import_pynvml
+
+from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, VllmConfig
+else:
+    ModelConfig = None
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+pynvml = import_pynvml()
+
+# pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models
+# see https://github.com/huggingface/diffusers/issues/9704 for details
+# torch.backends.cuda.enable_cudnn_sdp(False)
+
+
+def device_id_to_physical_device_id(device_id: int) -> int:
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        if device_ids == [""]:
+            msg = (
+                "CUDA_VISIBLE_DEVICES is set to empty string, which means"
+                " GPU support is disabled. If you are using ray, please unset"
+                " the environment variable `CUDA_VISIBLE_DEVICES` inside the"
+                " worker/actor. "
+                "Check https://github.com/vllm-project/vllm/issues/8402 for"
+                " more information.")
+            raise RuntimeError(msg)
+        physical_device_id = device_ids[device_id]
+        return int(physical_device_id)
+    else:
+        return device_id
+
+
+def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
+
+    @wraps(fn)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+        pynvml.nvmlInit()
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+class CudaPlatformBase(Platform):
+    _enum = PlatformEnum.CUDA
+    device_name: str = "cuda"
+    device_type: str = "cuda"
+    dispatch_key: str = "CUDA"
+    ray_device_key: str = "GPU"
+    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
+
+    @classmethod
+    def get_device_capability(cls,
+                              device_id: int = 0
+                              ) -> Optional[DeviceCapability]:
+        raise NotImplementedError
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        raise NotImplementedError
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        raise NotImplementedError
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        if enforce_eager:
+            logger.warning(
+                "To see benefits of async output processing, enable CUDA "
+                "graph. Since, enforce-eager is enabled, async output "
+                "processor cannot be used")
+            return False
+        return True
+
+    @classmethod
+    def is_fully_connected(cls, device_ids: List[int]) -> bool:
+        raise NotImplementedError
+
+    @classmethod
+    def log_warnings(cls):
+        pass
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        parallel_config = vllm_config.parallel_config
+        scheduler_config = vllm_config.scheduler_config
+        compilation_config = vllm_config.compilation_config
+        model_config = vllm_config.model_config
+
+        if parallel_config.worker_cls == "auto":
+            if scheduler_config.is_multi_step:
+                if envs.VLLM_USE_V1:
+                    raise NotImplementedError(
+                        "Multi-step scheduling is not supported (and not "
+                        "needed) on vLLM V1. Please launch without "
+                        "--num-scheduler-steps.")
+                else:
+                    parallel_config.worker_cls = \
+                        "vllm.worker.multi_step_worker.MultiStepWorker"
+            elif vllm_config.speculative_config:
+                if envs.VLLM_USE_V1:
+                    parallel_config.worker_cls = \
+                            "vllm.v1.worker.gpu_worker.Worker"
+                else:
+                    parallel_config.worker_cls = \
+                        "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+                    parallel_config.sd_worker_cls = \
+                        "vllm.worker.worker.Worker"
+            else:
+                if envs.VLLM_USE_V1:
+                    parallel_config.worker_cls = \
+                            "vllm.v1.worker.gpu_worker.Worker"
+                else:
+                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
+
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
+        # TODO(lucas): handle this more gracefully
+        # Note: model_config may be None during testing
+        if model_config is not None and model_config.use_mla:
+            # if `VLLM_ATTENTION_BACKEND` is not set and we are using MLA, then
+            # we default to FlashMLA backend, so we need to force the blocksize
+            # here
+            use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \
+                or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
+            from vllm.attention.ops.flashmla import is_flashmla_supported
+            if use_flashmla and is_flashmla_supported()[0] \
+                and cache_config.block_size != 64:
+                cache_config.block_size = 64
+                logger.info(
+                    "Forcing kv cache block size to 64 for FlashMLA backend.")
+
+        if (parallel_config.data_parallel_size > 1
+                and compilation_config.use_cudagraph):
+            logger.info(
+                "Data Parallel: Forcing enforce eager to be True since DP is "
+                "currently not supported with CUDA Graphs.")
+            vllm_config.model_config.enforce_eager = True
+            compilation_config.use_cudagraph = False
+
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        torch.cuda.reset_peak_memory_stats(device)
+        return torch.cuda.max_memory_allocated(device)
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
+                             kv_cache_dtype, block_size, use_v1,
+                             use_mla) -> str:
+        if use_mla:
+            # TODO(lucas): refactor to  be more concise
+            #  we should probably consider factoring out V1 here
+            if selected_backend == _Backend.TRITON_MLA or block_size != 64:
+                if use_v1:
+                    logger.info_once("Using Triton MLA backend on V1 engine.")
+                    return ("vllm.v1.attention.backends.mla."
+                            "triton_mla.TritonMLABackend")
+                else:
+                    logger.info("Using Triton MLA backend.")
+                    return "vllm.attention.backends.triton_mla.TritonMLABackend"
+            else:
+                from vllm.attention.backends.flashmla import (
+                    is_flashmla_supported)
+                if not is_flashmla_supported()[0]:
+                    logger.warning(
+                        "FlashMLA backend is not supported due to %s",
+                        is_flashmla_supported()[1])
+                elif block_size != 64:
+                    logger.warning(
+                        "FlashMLA backend is not supported for block size %d"
+                        " (currently only supports block size 64).",
+                        block_size)
+                else:
+                    if use_v1:
+                        logger.info_once(
+                            "Using FlashMLA backend on V1 engine.")
+                        return ("vllm.v1.attention.backends.mla."
+                                "flashmla.FlashMLABackend")
+                    else:
+                        logger.info("Using FlashMLA backend.")
+                        return ("vllm.attention.backends."
+                                "flashmla.FlashMLABackend")
+        if use_v1:
+            if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
+                logger.info_once("Using Triton backend on V1 engine.")
+                return ("vllm.v1.attention.backends."
+                        "triton_attn.TritonAttentionBackend")
+            if cls.has_device_capability(80):
+                logger.info_once("Using Flash Attention backend on V1 engine.")
+                return ("vllm.v1.attention.backends."
+                        "flash_attn.FlashAttentionBackend")
+        if selected_backend == _Backend.FLASHINFER:
+            logger.info("Using FlashInfer backend.")
+            return "vllm.attention.backends.flashinfer.FlashInferBackend"
+        elif selected_backend == _Backend.XFORMERS:
+            logger.info("Using XFormers backend.")
+            return "vllm.attention.backends.xformers.XFormersBackend"
+        elif selected_backend == _Backend.FLASH_ATTN:
+            pass
+        elif selected_backend:
+            raise ValueError(
+                f"Invalid attention backend for {cls.device_name}, "
+                f"with use_v1: {use_v1} use_mla: {use_mla}")
+
+        target_backend = _Backend.FLASH_ATTN
+        if not cls.has_device_capability(80):
+            # Volta and Turing NVIDIA GPUs.
+            logger.info(
+                "Cannot use FlashAttention-2 backend for Volta and Turing "
+                "GPUs.")
+            target_backend = _Backend.XFORMERS
+        elif dtype not in (torch.float16, torch.bfloat16):
+            logger.info(
+                "Cannot use FlashAttention-2 backend for dtype other than "
+                "torch.float16 or torch.bfloat16.")
+            target_backend = _Backend.XFORMERS
+        elif block_size % 16 != 0:
+            logger.info(
+                "Cannot use FlashAttention-2 backend for block size not "
+                "divisible by 16.")
+            target_backend = _Backend.XFORMERS
+
+        # FlashAttn is valid for the model, checking if the package is
+        # installed.
+        if target_backend == _Backend.FLASH_ATTN:
+            try:
+                # import vllm.vllm_flash_attn  # noqa: F401
+                from vllm.attention.backends.flash_attn import (  # noqa: F401
+                    FlashAttentionBackend, flash_attn_supports_fp8)
+
+                supported_sizes = \
+                    FlashAttentionBackend.get_supported_head_sizes()
+                if head_size not in supported_sizes:
+                    logger.info(
+                        "Cannot use FlashAttention-2 backend for head size %d.",
+                        head_size)
+                    target_backend = _Backend.XFORMERS
+                fp8_kv_cache = (kv_cache_dtype is not None
+                                and kv_cache_dtype.startswith("fp8"))
+                if (fp8_kv_cache and not flash_attn_supports_fp8()):
+                    logger.info(
+                        "Cannot use FlashAttention backend for FP8 KV cache.")
+                    logger.warning(
+                        "Please use FlashInfer backend with FP8 KV Cache for "
+                        "better performance by setting environment variable "
+                        "VLLM_ATTENTION_BACKEND=FLASHINFER")
+                    target_backend = _Backend.XFORMERS
+            except ImportError:
+                logger.info(
+                    "Cannot use FlashAttention-2 backend because the "
+                    "vllm.vllm_flash_attn package is not found. "
+                    "Make sure that vllm_flash_attn was built and installed "
+                    "(on by default).")
+                target_backend = _Backend.XFORMERS
+
+        if target_backend == _Backend.XFORMERS:
+            logger.info("Using XFormers backend.")
+            return "vllm.attention.backends.xformers.XFormersBackend"
+
+        logger.info("Using Flash Attention backend.")
+        return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
+
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase"  # noqa
+
+    @classmethod
+    def supports_fp8(cls) -> bool:
+        return cls.has_device_capability(89)
+
+    @classmethod
+    def supports_v1(cls, model_config: ModelConfig) -> bool:
+        return True
+
+    @classmethod
+    def use_custom_allreduce(cls) -> bool:
+        return True
+
+
+# NVML utils
+# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
+# all the related functions work on real physical device ids.
+# the major benefit of using NVML is that it will not initialize CUDA
+class NvmlCudaPlatform(CudaPlatformBase):
+
+    @classmethod
+    @with_nvml_context
+    def get_device_capability(cls,
+                              device_id: int = 0
+                              ) -> Optional[DeviceCapability]:
+        try:
+            physical_device_id = device_id_to_physical_device_id(device_id)
+            handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
+            major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+            return DeviceCapability(major=major, minor=minor)
+        except RuntimeError:
+            return None
+
+    @classmethod
+    @with_nvml_context
+    def has_device_capability(
+        cls,
+        capability: Union[Tuple[int, int], int],
+        device_id: int = 0,
+    ) -> bool:
+        try:
+            return super().has_device_capability(capability, device_id)
+        except RuntimeError:
+            return False
+
+    @classmethod
+    @with_nvml_context
+    def get_device_name(cls, device_id: int = 0) -> str:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        return cls._get_physical_device_name(physical_device_id)
+
+    @classmethod
+    @with_nvml_context
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
+        return pynvml.nvmlDeviceGetUUID(handle)
+
+    @classmethod
+    @with_nvml_context
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
+        return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
+
+    @classmethod
+    @with_nvml_context
+    def is_fully_connected(cls, physical_device_ids: List[int]) -> bool:
+        """
+        query if the set of gpus are fully connected by nvlink (1 hop)
+        """
+        handles = [
+            pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids
+        ]
+        for i, handle in enumerate(handles):
+            for j, peer_handle in enumerate(handles):
+                if i < j:
+                    try:
+                        p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                            handle,
+                            peer_handle,
+                            pynvml.NVML_P2P_CAPS_INDEX_NVLINK,
+                        )
+                        if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                            return False
+                    except pynvml.NVMLError:
+                        logger.exception(
+                            "NVLink detection failed. This is normal if"
+                            " your machine has no NVLink equipped.")
+                        return False
+        return True
+
+    @classmethod
+    def _get_physical_device_name(cls, device_id: int = 0) -> str:
+        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+        return pynvml.nvmlDeviceGetName(handle)
+
+    @classmethod
+    @with_nvml_context
+    def log_warnings(cls):
+        device_ids: int = pynvml.nvmlDeviceGetCount()
+        if device_ids > 1:
+            device_names = [
+                cls._get_physical_device_name(i) for i in range(device_ids)
+            ]
+            if (len(set(device_names)) > 1
+                    and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
+                logger.warning(
+                    "Detected different devices in the system: %s. Please"
+                    " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
+                    "avoid unexpected behavior.",
+                    ", ".join(device_names),
+                )
+
+
+class NonNvmlCudaPlatform(CudaPlatformBase):
+
+    @classmethod
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=9, minor=0)
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(device_id)
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        device_props = torch.cuda.get_device_properties(device_id)
+        return device_props.total_memory
+
+    @classmethod
+    def is_fully_connected(cls, physical_device_ids: List[int]) -> bool:
+        logger.exception(
+            "NVLink detection not possible, as context support was"
+            " not found. Assuming no NVLink available.")
+        return False
+
+
+# Autodetect either NVML-enabled or non-NVML platform
+# based on whether NVML is available.
+nvml_available = False
+try:
+    try:
+        pynvml.nvmlInit()
+        nvml_available = False
+    except Exception:
+        # On Jetson, NVML is not supported.
+        nvml_available = False
+finally:
+    if nvml_available:
+        pynvml.nvmlShutdown()
+
+CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
+
+try:
+    from sphinx.ext.autodoc.mock import _MockModule
+
+    if not isinstance(pynvml, _MockModule):
+        CudaPlatform.log_warnings()
+except ModuleNotFoundError:
+    CudaPlatform.log_warnings()
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from vllm import envs
+from vllm.logger import init_logger
+
+from .interface import Platform, PlatformEnum, _Backend
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
+
+class HpuPlatform(Platform):
+    _enum = PlatformEnum.HPU
+    device_name: str = "hpu"
+    device_type: str = "hpu"
+    dispatch_key: str = "HPU"
+    ray_device_key: str = "HPU"
+    device_control_env_var: str = "HABANA_VISIBLE_MODULES"
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
+        logger.info("Using HPUAttention backend.")
+        return "vllm.attention.backends.hpu_attn.HPUAttentionBackend"
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return True
+
+    @staticmethod
+    def inference_mode():
+        return torch.no_grad()
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+
+        scheduler_config = vllm_config.scheduler_config
+        if scheduler_config.is_multi_step:
+            raise NotImplementedError(
+                "Multi-step execution is not implemented for HPU")
+
+        if vllm_config.speculative_config is not None:
+            raise NotImplementedError(
+                "Speculative decoding is not implemented for HPU")
+
+        parallel_config = vllm_config.parallel_config
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
+
+        # NOTE(kzawora): default block size for Gaudi should be 128
+        # smaller sizes still work, but very inefficiently
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 128
+        if (parallel_config.distributed_executor_backend == 'mp'
+                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
+            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
+                              None) is not None:
+                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                               "might cause application hangs on exit. Using "
+                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
+                               "as it was explicitly requested.")
+            else:
+                logger.warning(
+                    "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                    "might cause application hangs on exit. Setting "
+                    "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                    "To override that behavior, please set "
+                    "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
+                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on HPU.")
+        return False
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"
+
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        return "vllm.distributed.device_communicators.hpu_communicator.HpuCommunicator"  # noqa
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -0,0 +1,393 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import enum
+import platform
+import random
+from platform import uname
+from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, VllmConfig
+    from vllm.utils import FlexibleArgumentParser
+else:
+    ModelConfig = None
+    VllmConfig = None
+    FlexibleArgumentParser = None
+
+logger = init_logger(__name__)
+
+
+def in_wsl() -> bool:
+    # Reference: https://github.com/microsoft/WSL/issues/4071
+    return "microsoft" in " ".join(uname()).lower()
+
+
+class _Backend(enum.Enum):
+    FLASH_ATTN = enum.auto()
+    FLASH_ATTN_VLLM_V1 = enum.auto()
+    TRITON_ATTN_VLLM_V1 = enum.auto()
+    XFORMERS = enum.auto()
+    ROCM_FLASH = enum.auto()
+    TORCH_SDPA = enum.auto()
+    FLASHINFER = enum.auto()
+    TRITON_MLA = enum.auto()  # Supported by V1
+    FLASHMLA = enum.auto()  # Supported by V1
+    HPU_ATTN = enum.auto()
+    PALLAS = enum.auto()
+    PALLAS_VLLM_V1 = enum.auto()
+    IPEX = enum.auto()
+    BLOCK_SPARSE_FLASH_ATTN = enum.auto()
+    NO_ATTENTION = enum.auto()
+
+
+class PlatformEnum(enum.Enum):
+    CUDA = enum.auto()
+    ROCM = enum.auto()
+    TPU = enum.auto()
+    HPU = enum.auto()
+    XPU = enum.auto()
+    CPU = enum.auto()
+    NEURON = enum.auto()
+    OOT = enum.auto()
+    UNSPECIFIED = enum.auto()
+
+
+class CpuArchEnum(enum.Enum):
+    X86 = enum.auto()
+    ARM = enum.auto()
+    POWERPC = enum.auto()
+    OTHER = enum.auto()
+    UNKNOWN = enum.auto()
+
+
+class DeviceCapability(NamedTuple):
+    major: int
+    minor: int
+
+    def as_version_str(self) -> str:
+        return f"{self.major}.{self.minor}"
+
+    def to_int(self) -> int:
+        """
+        Express device capability as an integer ``<major><minor>``.
+
+        It is assumed that the minor version is always a single digit.
+        """
+        assert 0 <= self.minor < 10
+        return self.major * 10 + self.minor
+
+
+class Platform:
+    _enum: PlatformEnum
+    device_name: str
+    device_type: str
+
+    # available dispatch keys:
+    # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
+    # use "CPU" as a fallback for platforms not registered in PyTorch
+    dispatch_key: str = "CPU"
+
+    # available ray device keys:
+    # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
+    # empty string means the device does not support ray
+    ray_device_key: str = ""
+
+    # platform-agnostic way to specify the device control environment variable,
+    # .e.g. CUDA_VISIBLE_DEVICES for CUDA.
+    # hint: search for "get_visible_accelerator_ids_env_var" in
+    # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa
+    device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"
+
+    # The torch.compile backend for compiling simple and
+    # standalone functions. The default value is "inductor" to keep
+    # the same behavior as PyTorch.
+    # NOTE: for the forward part of the model, vLLM has another separate
+    # compilation strategy.
+    simple_compile_backend: str = "inductor"
+
+    supported_quantization: list[str] = []
+
+    additional_env_vars: list[str] = []
+
+    def is_cuda(self) -> bool:
+        return self._enum == PlatformEnum.CUDA
+
+    def is_rocm(self) -> bool:
+        return self._enum == PlatformEnum.ROCM
+
+    def is_tpu(self) -> bool:
+        return self._enum == PlatformEnum.TPU
+
+    def is_hpu(self) -> bool:
+        return self._enum == PlatformEnum.HPU
+
+    def is_xpu(self) -> bool:
+        return self._enum == PlatformEnum.XPU
+
+    def is_cpu(self) -> bool:
+        return self._enum == PlatformEnum.CPU
+
+    def is_neuron(self) -> bool:
+        return self._enum == PlatformEnum.NEURON
+
+    def is_out_of_tree(self) -> bool:
+        return self._enum == PlatformEnum.OOT
+
+    def is_cuda_alike(self) -> bool:
+        """Stateless version of :func:`torch.cuda.is_available`."""
+        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
+        """Get the attention backend class of a device."""
+        return ""
+
+    @classmethod
+    def get_device_capability(
+        cls,
+        device_id: int = 0,
+    ) -> Optional[DeviceCapability]:
+        """Stateless version of :func:`torch.cuda.get_device_capability`."""
+        return None
+
+    @classmethod
+    def has_device_capability(
+        cls,
+        capability: Union[Tuple[int, int], int],
+        device_id: int = 0,
+    ) -> bool:
+        """
+        Test whether this platform is compatible with a device capability.
+
+        The ``capability`` argument can either be:
+
+        - A tuple ``(major, minor)``.
+        - An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
+        """
+        return True
+        current_capability = cls.get_device_capability(device_id=device_id)
+        if current_capability is None:
+            return False
+
+        if isinstance(capability, tuple):
+            return current_capability >= capability
+
+        return current_capability.to_int() >= capability
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Get the name of a device."""
+        raise NotImplementedError
+
+    @classmethod
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        """Get the uuid of a device, e.g. the PCI bus ID."""
+        raise NotImplementedError
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Get the total memory of a device in bytes."""
+        raise NotImplementedError
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        """
+        Check if the current platform supports async output.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def inference_mode(cls):
+        """A device-specific wrapper of `torch.inference_mode`.
+
+        This wrapper is recommended because some hardware backends such as TPU
+        do not support `torch.inference_mode`. In such a case, they will fall
+        back to `torch.no_grad` by overriding this method.
+        """
+        return torch.inference_mode(mode=True)
+
+    @classmethod
+    def seed_everything(cls, seed: Optional[int] = None) -> None:
+        """
+        Set the seed of each random module.
+        `torch.manual_seed` will set seed on all devices.
+
+        Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
+        """
+        if seed is not None:
+            random.seed(seed)
+            np.random.seed(seed)
+            torch.manual_seed(seed)
+
+    @classmethod
+    def pre_register_and_update(cls,
+                                parser: Optional[FlexibleArgumentParser] = None
+                                ) -> None:
+        """
+        Do some pre-registeration or update action for the current platform.
+
+        This function is called before global VllmConfig is initialized or cli
+        arguments are parsed. It's used for out-of-tree platforms to register or
+        update the configuration.
+
+        For example, the out-of-tree quantization config can be imported and
+        registered here dynamically.
+        """
+        pass
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        """
+        Check and update the configuration for the current platform.
+
+        It can raise an exception if the configuration is not compatible with
+        the current platform, or it can update the configuration to make it
+        compatible with the current platform.
+
+        The config is passed by reference, so it can be modified in place.
+        """
+        pass
+
+    @classmethod
+    def verify_model_arch(cls, model_arch: str) -> None:
+        """
+        Verify whether the current platform supports the specified model
+        architecture.
+
+        - This will raise an Error or Warning based on the model support on
+        the current platform.
+        - By default all models are considered supported.
+        """
+        pass
+
+    @classmethod
+    def verify_quantization(cls, quant: str) -> None:
+        """
+        Verify whether the quantization is supported by the current platform.
+        """
+        if cls.supported_quantization and \
+            quant not in cls.supported_quantization:
+            raise ValueError(
+                f"{quant} quantization is currently not supported in "
+                f"{cls.device_name}.")
+
+    @classmethod
+    def get_cpu_architecture(cls) -> CpuArchEnum:
+        """
+        Determine the CPU architecture of the current system.
+        Returns CpuArchEnum indicating the architecture type.
+        """
+        machine = platform.machine().lower()
+
+        if machine in ("x86_64", "amd64", "i386", "i686"):
+            return CpuArchEnum.X86
+        elif machine.startswith("arm") or machine.startswith("aarch"):
+            return CpuArchEnum.ARM
+        elif machine.startswith("ppc"):
+            return CpuArchEnum.POWERPC
+
+        return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        """Checks whether pin memory is available on the current platform."""
+        if in_wsl():
+            # Pinning memory in WSL is not supported.
+            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+            logger.warning("Using 'pin_memory=False' as WSL is detected. "
+                           "This may slow down the performance.")
+            return False
+        return True
+
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        """
+        Return the memory usage in bytes.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        """
+        Return the punica wrapper for current platform.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        """
+        Get device specific communicator class for distributed communication.
+        """
+        return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase"  # noqa
+
+    @classmethod
+    def supports_fp8(cls) -> bool:
+        """
+        Returns whether the current platform supports FP8 types.
+        """
+        return False
+
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        """
+        Returns whether the preferred FP8 type is FNUZ on the current platform.
+
+        There are two representations of FP8, OCP FP8 and FNUZ FP8.
+        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
+        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
+
+        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
+        hardware has converged on the OCP FP8 standard.
+        """
+        return False
+
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        """
+        Returns the preferred FP8 type on the current platform.
+
+        See the documentation for is_fp8_fnuz for details.
+        """
+        return torch.float8_e4m3fn
+
+    @classmethod
+    def use_all_gather(cls) -> bool:
+        """
+        Whether to use allgather in LogitsProcessor to gather the logits.
+        """
+        import vllm.envs as envs
+        from vllm.config import get_current_vllm_config
+
+        parallel_config = get_current_vllm_config().parallel_config
+        return (envs.VLLM_USE_V1
+                or parallel_config.distributed_executor_backend
+                == "external_launcher")
+
+    @classmethod
+    def supports_v1(cls, model_config: ModelConfig) -> bool:
+        """Returns whether the current platform can support v1 for the supplied
+        model configuration.
+        """
+        return False
+
+    @classmethod
+    def use_custom_allreduce(cls) -> bool:
+        """
+        Returns if custom allreduce is supported on the current platform
+        """
+        return False
+
+
+class UnspecifiedPlatform(Platform):
+    _enum = PlatformEnum.UNSPECIFIED
+    device_type = ""
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import TYPE_CHECKING, Optional
+
+from vllm import envs
+from vllm.logger import init_logger
+
+from .interface import Platform, PlatformEnum
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
+
+class NeuronPlatform(Platform):
+    _enum = PlatformEnum.NEURON
+    device_name: str = "neuron"
+    device_type: str = "neuron"
+    ray_device_key: str = "neuron_cores"
+    supported_quantization: list[str] = ["neuron_quant"]
+    device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return "neuron"
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return False
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        parallel_config = vllm_config.parallel_config
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = \
+                "vllm.worker.neuron_worker.NeuronWorker"
+
+        if parallel_config.world_size > 1:
+            parallel_config.distributed_executor_backend = "uni"
+
+        assert (vllm_config.lora_config
+                is None), "LoRA is not supported for Neuron backend."
+        assert (not vllm_config.speculative_config
+                ), "Speculative decoding not yet supported for Neuron backend."
+
+        cache_config = vllm_config.cache_config
+        if cache_config:
+            # neuron needs block_size = max_model_len
+            vllm_config.cache_config.block_size = \
+                vllm_config.model_config.max_model_len
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on Neuron.")
+        return False
+
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        if envs.VLLM_USE_V1:
+            return "vllm.distributed.device_communicators.neuron_communicator.NeuronCommunicator"  # noqa
+        else:
+            return Platform.get_device_communicator_cls()
+
+    @classmethod
+    def use_all_gather(cls) -> bool:
+        return True
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from functools import cache, lru_cache, wraps
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+import torch
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, VllmConfig
+else:
+    ModelConfig = None
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
+try:
+    from amdsmi import (AmdSmiException, amdsmi_get_gpu_asic_info,
+                        amdsmi_get_processor_handles, amdsmi_init,
+                        amdsmi_shut_down, amdsmi_topo_get_link_type)
+except ImportError as e:
+    logger.warning("Failed to import from amdsmi with %r", e)
+
+try:
+    import vllm._C  # noqa: F401
+except ImportError as e:
+    logger.warning("Failed to import from vllm._C with %r", e)
+
+# import custom ops, trigger op registration
+try:
+    import vllm._rocm_C  # noqa: F401
+except ImportError as e:
+    logger.warning("Failed to import from vllm._rocm_C with %r", e)
+
+# Models not supported by ROCm.
+_ROCM_UNSUPPORTED_MODELS: List[str] = []
+
+# Models partially supported by ROCm.
+# Architecture -> Reason.
+_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
+                    "Triton flash attention. For half-precision SWA support, "
+                    "please use CK flash attention by setting "
+                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
+_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
+    "Qwen2ForCausalLM":
+    _ROCM_SWA_REASON,
+    "MistralForCausalLM":
+    _ROCM_SWA_REASON,
+    "MixtralForCausalLM":
+    _ROCM_SWA_REASON,
+    "PaliGemmaForConditionalGeneration":
+    ("ROCm flash attention does not yet "
+     "fully support 32-bit precision on PaliGemma"),
+    "Phi3VForCausalLM":
+    ("ROCm Triton flash attention may run into compilation errors due to "
+     "excessive use of shared memory. If this happens, disable Triton FA "
+     "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
+}
+
+# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES``
+if "HIP_VISIBLE_DEVICES" in os.environ:
+    val = os.environ["HIP_VISIBLE_DEVICES"]
+    if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
+        assert val == cuda_val
+    else:
+        os.environ["CUDA_VISIBLE_DEVICES"] = val
+
+# AMDSMI utils
+# Note that NVML is not affected by `{CUDA/HIP}_VISIBLE_DEVICES`,
+# all the related functions work on real physical device ids.
+# the major benefit of using AMDSMI is that it will not initialize CUDA
+
+
+def with_amdsmi_context(fn):
+
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        amdsmi_init()
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            amdsmi_shut_down()
+
+    return wrapper
+
+
+def device_id_to_physical_device_id(device_id: int) -> int:
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        physical_device_id = device_ids[device_id]
+        return int(physical_device_id)
+    else:
+        return device_id
+
+
+@cache
+def use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int,
+                                    block_size: int, gqa_ratio: int,
+                                    max_seq_len: int,
+                                    sliding_window: int) -> bool:
+
+    GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+    ON_NAVI = "gfx1" in GPU_ARCH
+    ON_MI250_MI300 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942"])
+
+    # rocm custom page attention not support on navi (gfx1*)
+    return (ON_MI250_MI300 and not ON_NAVI
+            and (sliding_window == 0 or sliding_window == (-1, -1))
+            and (qtype == torch.half or qtype == torch.bfloat16)
+            and (head_size == 64 or head_size == 128)
+            and (block_size == 16 or block_size == 32)
+            and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768
+            and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
+
+
+class RocmPlatform(Platform):
+    _enum = PlatformEnum.ROCM
+    device_name: str = "rocm"
+    device_type: str = "cuda"
+    dispatch_key: str = "CUDA"
+    ray_device_key: str = "GPU"
+    # rocm shares the same device control env var as CUDA
+    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
+
+    supported_quantization: list[str] = [
+        "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
+        "fbgemm_fp8", "gguf", "quark", "ptpc_fp8"
+    ]
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
+                             kv_cache_dtype, block_size, use_v1,
+                             use_mla) -> str:
+        if use_mla:
+            logger.info("Using Triton MLA backend.")
+            return "vllm.attention.backends.triton_mla.TritonMLABackend"
+        selected_backend = (_Backend.ROCM_FLASH if selected_backend
+                            == _Backend.FLASH_ATTN else selected_backend)
+        if envs.VLLM_USE_V1:
+            logger.info("Using Triton Attention backend on V1 engine.")
+            return ("vllm.v1.attention.backends."
+                    "triton_attn.TritonAttentionBackend")
+        if selected_backend == _Backend.ROCM_FLASH:
+            if not cls.has_device_capability(90):
+                # not Instinct series GPUs.
+                logger.info("flash_attn is not supported on NAVI GPUs.")
+        else:
+            logger.info("%s is not supported in AMD GPUs.", selected_backend)
+        logger.info("Using ROCmFlashAttention backend.")
+        return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend"  # noqa: E501
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_capability(cls,
+                              device_id: int = 0
+                              ) -> Optional[DeviceCapability]:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+
+    @staticmethod
+    @with_amdsmi_context
+    def is_fully_connected(physical_device_ids: List[int]) -> bool:
+        """
+        Query if the set of gpus are fully connected by xgmi (1 hop)
+        """
+        handles = [
+            amdsmi_get_processor_handles()[i] for i in physical_device_ids
+        ]
+        for i, handle in enumerate(handles):
+            for j, peer_handle in enumerate(handles):
+                if i < j:
+                    try:
+                        link_type = amdsmi_topo_get_link_type(
+                            handle, peer_handle)
+                        # type is 2 for XGMI
+                        if link_type["hops"] != 1 or link_type["type"] != 2:
+                            return False
+                    except AmdSmiException as error:
+                        logger.error("AMD 1 hop XGMI detection failed.",
+                                     exc_info=error)
+                        return False
+        return True
+
+    @classmethod
+    @with_amdsmi_context
+    @lru_cache(maxsize=8)
+    def get_device_name(cls, device_id: int = 0) -> str:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        handle = amdsmi_get_processor_handles()[physical_device_id]
+        return amdsmi_get_gpu_asic_info(handle)["market_name"]
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        device_props = torch.cuda.get_device_properties(device_id)
+        return device_props.total_memory
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        if enforce_eager:
+            logger.warning(
+                "To see benefits of async output processing, enable CUDA "
+                "graph. Since, enforce-eager is enabled, async output "
+                "processor cannot be used")
+            return False
+        return True
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
+        parallel_config = vllm_config.parallel_config
+        scheduler_config = vllm_config.scheduler_config
+        if parallel_config.worker_cls == "auto":
+            if scheduler_config.is_multi_step:
+                if envs.VLLM_USE_V1:
+                    raise NotImplementedError(
+                        "Multi-step scheduling is not supported (and not "
+                        "needed) on vLLM V1. Please launch without "
+                        "--num-scheduler-steps.")
+                else:
+                    parallel_config.worker_cls = \
+                        "vllm.worker.multi_step_worker.MultiStepWorker"
+            elif vllm_config.speculative_config:
+                if envs.VLLM_USE_V1:
+                    raise NotImplementedError(
+                        "Speculative decoding is not yet supported on vLLM V1."
+                    )
+                else:
+                    parallel_config.worker_cls = \
+                        "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+                    parallel_config.sd_worker_cls = \
+                        "vllm.worker.worker.Worker"
+            else:
+                if envs.VLLM_USE_V1:
+                    parallel_config.worker_cls = \
+                            "vllm.v1.worker.gpu_worker.Worker"
+                else:
+                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
+
+    @classmethod
+    def verify_model_arch(cls, model_arch: str) -> None:
+        if model_arch in _ROCM_UNSUPPORTED_MODELS:
+            raise ValueError(f"Model architecture '{model_arch}' is not "
+                             "supported by ROCm for now.")
+
+        if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
+            msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]
+            logger.warning(
+                "Model architecture '%s' is partially "
+                "supported by ROCm: %s", model_arch, msg)
+
+    @classmethod
+    def verify_quantization(cls, quant: str) -> None:
+        super().verify_quantization(quant)
+        if quant == "awq" and not envs.VLLM_USE_TRITON_AWQ:
+            logger.warning(
+                "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
+                " is not set, enabling VLLM_USE_TRITON_AWQ.")
+        envs.VLLM_USE_TRITON_AWQ = True
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
+
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        torch.cuda.reset_peak_memory_stats(device)
+        return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info(
+            device)[0]
+
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator"  # noqa
+
+    @classmethod
+    def supports_fp8(cls) -> bool:
+        gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
+        return any(gfx in gcn_arch for gfx in ['gfx94', 'gfx95', 'gfx12'])
+
+    @classmethod
+    def is_fp8_fnuz(cls) -> bool:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return 'gfx94' in torch.cuda.get_device_properties(0).gcnArchName
+
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        if cls.is_fp8_fnuz():
+            return torch.float8_e4m3fnuz
+        else:
+            return torch.float8_e4m3fn
+
+    @classmethod
+    def supports_v1(cls, model_config: ModelConfig) -> bool:
+        # V1 support on AMD gpus is experimental
+        return True
+
+    @classmethod
+    def use_custom_allreduce(cls) -> bool:
+        # We only enable custom allreduce for MI300 series
+        gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
+        supported_archs = ['gfx94']
+        return any(gfx in gcn_arch for gfx in supported_archs)
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+from .interface import Platform, PlatformEnum, _Backend
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, VllmConfig
+else:
+    ModelConfig = None
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
+
+class TpuPlatform(Platform):
+    _enum = PlatformEnum.TPU
+    device_name: str = "tpu"
+    device_type: str = "tpu"
+    dispatch_key: str = "XLA"
+    ray_device_key: str = "TPU"
+    device_control_env_var: str = "TPU_VISIBLE_CHIPS"
+
+    supported_quantization: list[str] = [
+        "tpu_int8", "compressed-tensors", "compressed_tensors"
+    ]
+
+    additional_env_vars: list[str] = [
+        "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
+    ]
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
+        if (selected_backend != _Backend.PALLAS
+                and selected_backend != _Backend.PALLAS_VLLM_V1):
+            logger.info("Cannot use %s backend on TPU.", selected_backend)
+
+        if use_v1:
+            logger.info("Using Pallas V1 backend.")
+            return "vllm.v1.attention.backends.pallas.PallasAttentionBackend"
+        else:
+            logger.info("Using Pallas backend.")
+            return "vllm.attention.backends.pallas.PallasAttentionBackend"
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return "tpu"
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        raise NotImplementedError
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return not envs.VLLM_USE_V1
+
+    @classmethod
+    def inference_mode(cls):
+        return torch.no_grad()
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        from vllm.config import CompilationLevel
+
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
+        compilation_config = vllm_config.compilation_config
+
+        # TPU only supports DYNAMO_ONCE compilation level
+        if compilation_config.level != CompilationLevel.DYNAMO_ONCE:
+            logger.info("[TPU] Forcing DYNAMO_ONCE compilation level")
+            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+
+        if compilation_config.backend == "":
+            compilation_config.backend = "openxla"
+
+        assert vllm_config.speculative_config is None, \
+            "TPU does not support speculative decoding"
+
+        if vllm_config.model_config.dtype in (torch.float16, torch.float32):
+            logger.warning(
+                "The TPU backend currently does not support %s. "
+                "Using bfloat16 instead.", vllm_config.model_config.dtype)
+            vllm_config.model_config.dtype = torch.bfloat16
+
+        parallel_config = vllm_config.parallel_config
+        scheduler_config = vllm_config.scheduler_config
+        if parallel_config.worker_cls == "auto":
+            if scheduler_config.is_multi_step:
+                if envs.VLLM_USE_V1:
+                    raise NotImplementedError(
+                        "Multi-step scheduling is not supported (and not "
+                        "needed) on vLLM V1. Please launch without "
+                        "--num-scheduler-steps.")
+                else:
+                    parallel_config.worker_cls = \
+                        "vllm.worker.multi_step_tpu_worker.MultiStepTPUWorker"
+            else:
+                if envs.VLLM_USE_V1:
+                    parallel_config.worker_cls = \
+                        "vllm.v1.worker.tpu_worker.TPUWorker"
+                else:
+                    parallel_config.worker_cls = \
+                        "vllm.worker.tpu_worker.TPUWorker"
+
+        assert not vllm_config.speculative_config, (
+            "Speculative decoding is not yet supported for TPU backend")
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on TPU.")
+        return False
+
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator"  # noqa
+
+    @classmethod
+    def use_all_gather(cls) -> bool:
+        return True
+
+    @classmethod
+    def supports_v1(cls, model_config: ModelConfig) -> bool:
+        # V1 support on TPU is experimental
+        return True
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from vllm.logger import init_logger
+
+from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
+
+class XPUPlatform(Platform):
+    _enum = PlatformEnum.XPU
+    device_name: str = "xpu"
+    device_type: str = "xpu"
+    dispatch_key: str = "XPU"
+    # Intel XPU's device key is "GPU" for Ray.
+    # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
+    ray_device_key: str = "GPU"
+    device_control_env_var: str = "ONEAPI_DEVICE_SELECTOR"
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool,
+                             use_mla: bool) -> str:
+        if selected_backend != _Backend.IPEX:
+            logger.info("Cannot use %s backend on XPU.", selected_backend)
+        logger.info("Using IPEX attention backend.")
+        return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
+
+    @staticmethod
+    def get_device_capability(
+            device_id: int = 0) -> Optional[DeviceCapability]:
+        # capacity format differs from cuda's and will cause unexpected
+        # failure, so use None directly
+        return None
+
+    @staticmethod
+    def get_device_name(device_id: int = 0) -> str:
+        return torch.xpu.get_device_name(device_id)
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        device_props = torch.xpu.get_device_properties(device_id)
+        return device_props.total_memory
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return True
+
+    @staticmethod
+    def inference_mode():
+        return torch.no_grad()
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
+        # check and update model config
+        model_config = vllm_config.model_config
+        if model_config.dtype == torch.bfloat16:
+            bf16_supported = cls.device_support_bf16()
+            if not bf16_supported:
+                logger.warning(
+                    "bfloat16 is only supported on Intel Data Center GPU, "
+                    "Intel Arc GPU is not supported yet. Your device is %s,"
+                    " which is not supported. will fallback to float16",
+                    cls.get_device_name())
+                model_config.dtype = torch.float16
+        if not model_config.enforce_eager:
+            logger.warning(
+                "CUDA graph is not supported on XPU, fallback to the eager "
+                "mode.")
+            model_config.enforce_eager = True
+
+        if vllm_config.speculative_config is not None:
+            raise NotImplementedError(
+                "XPU does not support speculative decoding")
+
+        if vllm_config.device_config is not None:
+            assert vllm_config.device_config.device_type == "xpu"
+
+        # check and update parallel config
+        parallel_config = vllm_config.parallel_config
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
+
+        if parallel_config.distributed_executor_backend is None:
+            parallel_config.distributed_executor_backend = "ray"
+        elif parallel_config.distributed_executor_backend == "mp":
+            # FIXME(kunshang):
+            # spawn needs calling `if __name__ == '__main__':``
+            # fork is not supported for xpu start new process.
+            logger.error(
+                "Both start methods (spawn and fork) have issue "
+                "on XPU if you use mp backend, setting it to ray instead.")
+            parallel_config.distributed_executor_backend = "ray"
+
+        elif parallel_config.distributed_executor_backend != "ray":
+            logger.warning(
+                "%s is not supported on XPU, fallback to ray distributed"
+                " executor backend.",
+                parallel_config.distributed_executor_backend)
+            parallel_config.distributed_executor_backend = "ray"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on XPU.")
+        return False
+
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        torch.xpu.reset_peak_memory_stats(device)
+        return torch.xpu.max_memory_allocated(device)
+
+    @classmethod
+    def device_support_bf16(cls) -> bool:
+        device_name = cls.get_device_name().lower()
+        if device_name.count("arc") > 0:
+            return False
+        elif device_name.count("data center gpu") > 0:
+            return True
+        else:
+            logger.warning("Unknown device name %s, always use float16",
+                           device_name)
+            return False
+
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator"  # noqa