This commit is contained in:
2025-08-13 19:46:19 +08:00
commit 5d2e7edf78
1232 changed files with 361215 additions and 0 deletions

291
platforms/__init__.py Normal file
View File

@@ -0,0 +1,291 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import traceback
from itertools import chain
from typing import TYPE_CHECKING, Optional
from vllm.plugins import load_plugins_by_group
from vllm.utils import resolve_obj_by_qualname
from .interface import _Backend # noqa: F401
from .interface import CpuArchEnum, Platform, PlatformEnum
# Module logger shared by all platform-detection helpers below.
logger = logging.getLogger(__name__)
def vllm_version_matches_substr(substr: str) -> bool:
    """Return True if the installed vLLM version string contains *substr*.

    Used by the platform detectors below, e.g. to recognize CPU builds
    whose version string contains "cpu".

    Raises:
        PackageNotFoundError: if the vLLM package is not installed, in
            which case platform detection cannot proceed.
    """
    from importlib.metadata import PackageNotFoundError, version
    try:
        vllm_version = version("vllm")
    except PackageNotFoundError:
        logger.warning(
            "The vLLM package was not found, so its version could not be "
            "inspected. This may cause platform detection to fail.")
        # Bare `raise` re-raises the active exception with its original
        # traceback intact (no need to re-raise a bound name).
        raise
    return substr in vllm_version
def tpu_platform_plugin() -> Optional[str]:
    """Detect TPUs via the presence of the libtpu package."""
    logger.debug("Checking if TPU platform is available.")
    try:
        # While it's technically possible to install libtpu on a
        # non-TPU machine, this is a very uncommon scenario. Therefore,
        # we assume that libtpu is installed if and only if the machine
        # has TPUs.
        import libtpu  # noqa: F401
    except Exception as e:
        logger.debug("TPU platform is not available because: %s", str(e))
        return None
    logger.debug("Confirmed TPU platform is available.")
    return "vllm.platforms.tpu.TpuPlatform"
def cuda_platform_plugin() -> Optional[str]:
    """Detect NVIDIA CUDA availability via NVML, with a Jetson fallback.

    Returns the qualified name of the CUDA platform class when at least
    one GPU is visible and this is not a CPU-only vLLM build, else None.
    """
    is_cuda = False
    logger.debug("Checking if CUDA platform is available.")
    try:
        from vllm.utils import import_pynvml
        pynvml = import_pynvml()
        pynvml.nvmlInit()
        try:
            # NOTE: Edge case: vllm cpu build on a GPU machine.
            # Third-party pynvml can be imported in cpu build,
            # we need to check if vllm is built with cpu too.
            # Otherwise, vllm will always activate cuda plugin
            # on a GPU machine, even if in a cpu build.
            # Query NVML and the build flavor once each instead of twice.
            device_count = pynvml.nvmlDeviceGetCount()
            is_cpu_build = vllm_version_matches_substr("cpu")
            is_cuda = device_count > 0 and not is_cpu_build
            if device_count <= 0:
                logger.debug(
                    "CUDA platform is not available because no GPU is found.")
            if is_cpu_build:
                logger.debug("CUDA platform is not available because"
                             " vLLM is built with CPU.")
            if is_cuda:
                logger.debug("Confirmed CUDA platform is available.")
        finally:
            pynvml.nvmlShutdown()
    except Exception as e:
        logger.debug("Exception happens when checking CUDA platform: %s",
                     str(e))
        if "nvml" not in e.__class__.__name__.lower():
            # If the error is not related to NVML, re-raise it.
            raise e
        # CUDA is supported on Jetson, but NVML may not be.
        import os

        def cuda_is_jetson() -> bool:
            # Jetson boards expose Tegra release/firmware files.
            return os.path.isfile("/etc/nv_tegra_release") \
                or os.path.exists("/sys/class/tegra-firmware")

        if cuda_is_jetson():
            logger.debug("Confirmed CUDA platform is available on Jetson.")
            is_cuda = True
        else:
            logger.debug("CUDA platform is not available because: %s", str(e))
    return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None
def rocm_platform_plugin() -> Optional[str]:
    """Detect AMD ROCm availability via the amdsmi library."""
    is_rocm = False
    logger.debug("Checking if ROCm platform is available.")
    try:
        import amdsmi
        amdsmi.amdsmi_init()
        try:
            handles = amdsmi.amdsmi_get_processor_handles()
            if handles:
                logger.debug("Confirmed ROCm platform is available.")
                is_rocm = True
            else:
                logger.debug("ROCm platform is not available because"
                             " no GPU is found.")
        finally:
            amdsmi.amdsmi_shut_down()
    except Exception as e:
        logger.debug("ROCm platform is not available because: %s", str(e))
    return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
def hpu_platform_plugin() -> Optional[str]:
    """Detect Intel Gaudi (HPU) availability via habana_frameworks."""
    is_hpu = False
    logger.debug("Checking if HPU platform is available.")
    try:
        from importlib import util
        if util.find_spec('habana_frameworks') is not None:
            is_hpu = True
            logger.debug("Confirmed HPU platform is available.")
        else:
            logger.debug("HPU platform is not available because "
                         "habana_frameworks is not found.")
    except Exception as e:
        logger.debug("HPU platform is not available because: %s", str(e))
    return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None
def xpu_platform_plugin() -> Optional[str]:
    """Detect Intel XPU availability via IPEX and torch.xpu."""
    is_xpu = False
    logger.debug("Checking if XPU platform is available.")
    try:
        # installed IPEX if the machine has XPUs.
        import intel_extension_for_pytorch  # noqa: F401
        import oneccl_bindings_for_pytorch  # noqa: F401
        import torch
        xpu_module = getattr(torch, 'xpu', None)
        if xpu_module is not None and xpu_module.is_available():
            is_xpu = True
            logger.debug("Confirmed XPU platform is available.")
    except Exception as e:
        logger.debug("XPU platform is not available because: %s", str(e))
    return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
def cpu_platform_plugin() -> Optional[str]:
    """Detect whether the CPU platform applies.

    True for CPU builds of vLLM and for macOS machines.
    """
    is_cpu = False
    logger.debug("Checking if CPU platform is available.")
    try:
        if vllm_version_matches_substr("cpu"):
            is_cpu = True
            logger.debug("Confirmed CPU platform is available because"
                         " vLLM is built with CPU.")
        else:
            import sys
            if sys.platform.startswith("darwin"):
                is_cpu = True
                logger.debug("Confirmed CPU platform is available"
                             " because the machine is MacOS.")
    except Exception as e:
        logger.debug("CPU platform is not available because: %s", str(e))
    return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None
def neuron_platform_plugin() -> Optional[str]:
    """Detect AWS Neuron availability via either Neuron SDK package."""
    logger.debug("Checking if Neuron platform is available.")
    tnx_installed = False
    try:
        import transformers_neuronx  # noqa: F401
        tnx_installed = True
        logger.debug("Confirmed Neuron platform is available because"
                     " transformers_neuronx is found.")
    except ImportError:
        pass
    nxd_installed = False
    try:
        import neuronx_distributed_inference  # noqa: F401
        nxd_installed = True
        logger.debug("Confirmed Neuron platform is available because"
                     " neuronx_distributed_inference is found.")
    except ImportError:
        pass
    if tnx_installed or nxd_installed:
        return "vllm.platforms.neuron.NeuronPlatform"
    return None
# Map of built-in platform name -> zero-argument detector. Each detector
# returns the platform class qualname, or None when that platform is
# unavailable on this machine.
builtin_platform_plugins = {
    'tpu': tpu_platform_plugin,
    'cuda': cuda_platform_plugin,
    'rocm': rocm_platform_plugin,
    'hpu': hpu_platform_plugin,
    'xpu': xpu_platform_plugin,
    'cpu': cpu_platform_plugin,
    'neuron': neuron_platform_plugin,
}
def resolve_current_platform_cls_qualname() -> str:
    """Run all platform detectors and return the active platform qualname.

    Out-of-tree plugins (entry-point group ``vllm.platform_plugins``) take
    precedence over built-in ones; more than one activated plugin within
    the same group is an error. Falls back to ``UnspecifiedPlatform`` when
    nothing is detected.
    """
    platform_plugins = load_plugins_by_group('vllm.platform_plugins')
    activated_plugins = []
    for name, func in chain(builtin_platform_plugins.items(),
                            platform_plugins.items()):
        try:
            assert callable(func)
            platform_cls_qualname = func()
            if platform_cls_qualname is not None:
                activated_plugins.append(name)
        except Exception:
            # A failing detector only means its platform is unavailable,
            # but log it so broken plugins are not silently ignored.
            logger.debug("Exception happens when detecting platform %s",
                         name,
                         exc_info=True)
    activated_builtin_plugins = list(
        set(activated_plugins) & set(builtin_platform_plugins.keys()))
    activated_oot_plugins = list(
        set(activated_plugins) & set(platform_plugins.keys()))
    if len(activated_oot_plugins) >= 2:
        raise RuntimeError(
            "Only one platform plugin can be activated, but got: "
            f"{activated_oot_plugins}")
    elif len(activated_oot_plugins) == 1:
        platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]()
        logger.info("Platform plugin %s is activated",
                    activated_oot_plugins[0])
    elif len(activated_builtin_plugins) >= 2:
        raise RuntimeError(
            "Only one platform plugin can be activated, but got: "
            f"{activated_builtin_plugins}")
    elif len(activated_builtin_plugins) == 1:
        platform_cls_qualname = builtin_platform_plugins[
            activated_builtin_plugins[0]]()
        logger.info("Automatically detected platform %s.",
                    activated_builtin_plugins[0])
    else:
        platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform"
        logger.info(
            "No platform detected, vLLM is running on UnspecifiedPlatform")
    return platform_cls_qualname
# Lazily-resolved singleton Platform instance; populated on first access
# of `current_platform` via the module __getattr__ below.
_current_platform = None
# Stack trace captured when the platform was first resolved, kept for
# debugging import-order problems.
_init_trace: str = ''
if TYPE_CHECKING:
    # Tells type checkers that the module exposes `current_platform`,
    # even though it only exists dynamically at runtime.
    current_platform: Platform
def __getattr__(name: str):
    """Module-level attribute hook (PEP 562) that lazily resolves
    `current_platform` on first access."""
    if name == 'current_platform':
        # lazy init current_platform.
        # 1. out-of-tree platform plugins need `from vllm.platforms import
        #    Platform` so that they can inherit `Platform` class. Therefore,
        #    we cannot resolve `current_platform` during the import of
        #    `vllm.platforms`.
        # 2. when users use out-of-tree platform plugins, they might run
        #    `import vllm`, some vllm internal code might access
        #    `current_platform` during the import, and we need to make sure
        #    `current_platform` is only resolved after the plugins are loaded
        #    (we have tests for this, if any developer violate this, they will
        #    see the test failures).
        global _current_platform
        if _current_platform is None:
            platform_cls_qualname = resolve_current_platform_cls_qualname()
            _current_platform = resolve_obj_by_qualname(
                platform_cls_qualname)()
            global _init_trace
            # Remember where the platform was first resolved, to help
            # debug initialization-order issues.
            _init_trace = "".join(traceback.format_stack())
        return _current_platform
    elif name in globals():
        return globals()[name]
    else:
        raise AttributeError(
            f"No attribute named '{name}' exists in {__name__}.")
# Public API of this module; `_init_trace` is exported so callers can see
# where the platform was first resolved.
__all__ = [
    'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum',
    "_init_trace"
]

266
platforms/cpu.py Normal file
View File

@@ -0,0 +1,266 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import platform
import sys
from importlib.util import find_spec
from typing import TYPE_CHECKING, Optional
import psutil
import torch
from vllm.logger import init_logger
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
from .interface import CpuArchEnum, Platform, PlatformEnum, _Backend
logger = init_logger(__name__)
if TYPE_CHECKING:
    from vllm.config import VllmConfig
else:
    # Avoid importing vllm.config at runtime; it is only needed for type
    # annotations in this module.
    VllmConfig = None
def get_max_threads(pid=0):
    """Return the number of CPU threads usable by process *pid*.

    Uses the scheduler affinity mask where the OS exposes it (e.g. Linux)
    and falls back to the total CPU count on macOS.

    Raises:
        NotImplementedError: on operating systems with neither mechanism.
    """
    if hasattr(os, 'sched_getaffinity'):
        return len(os.sched_getaffinity(pid))
    if platform.system() == 'Darwin':
        return os.cpu_count()
    raise NotImplementedError("Unsupported OS")
class CpuPlatform(Platform):
    """Platform implementation for running vLLM on CPU-only hosts."""

    _enum = PlatformEnum.CPU
    device_name: str = "cpu"
    device_type: str = "cpu"
    dispatch_key: str = "CPU"

    @property
    def supported_dtypes(self) -> list[torch.dtype]:
        """Dtypes usable on this CPU, based on architecture and OS."""
        if self.get_cpu_architecture() == CpuArchEnum.POWERPC:
            return [torch.bfloat16, torch.float32]
        elif sys.platform.startswith(
                "darwin") and self.get_cpu_architecture() == CpuArchEnum.ARM:
            # TODO: change this condition to check if the platform support bf16
            # instead of checking the OS. For instance M2 shall supports bf16
            # already. But we need to modify `cpu_extension.cmake` to activate
            # the feature in the build.
            return [torch.float16, torch.float32]
        # x86/aarch64 CPU has supported both bf16 and fp16 natively.
        return [torch.bfloat16, torch.float16, torch.float32]

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return "cpu"

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        """Return the qualname of the attention backend class to use.

        Only Torch SDPA (or CPU MLA when `use_mla`) is supported on CPU;
        any other requested backend is ignored with an info log.
        """
        if selected_backend and selected_backend != _Backend.TORCH_SDPA:
            logger.info("Cannot use %s backend on CPU.", selected_backend)
        if use_mla:
            logger.info("Using CPU MLA backend.")
            return "vllm.attention.backends.cpu_mla.CPUMLABackend"
        logger.info("Using Torch SDPA backend.")
        if use_v1:
            return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
        else:
            return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        # On CPU, "device memory" is the host's total RAM.
        return psutil.virtual_memory().total

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        return False

    @classmethod
    def inference_mode(cls):
        return torch.no_grad()

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Validate and normalize *vllm_config* for the CPU backend.

        Mutates the config in place: forces eager execution, fixes the
        KV-cache block size and space, selects the worker class, adjusts
        compilation settings, and exports CPU-related environment
        variables (OpenMP, NumExpr, multiprocessing method).
        """
        import vllm.envs as envs
        from vllm.utils import GiB_bytes
        model_config = vllm_config.model_config
        # Reminder: Please update docs/features/compatibility_matrix.md
        # If the feature combo become valid
        if not model_config.enforce_eager:
            model_config.enforce_eager = True
        model_config.disable_cascade_attn = True
        cache_config = vllm_config.cache_config
        ipex_available = find_spec("intel_extension_for_pytorch") is not None
        if cache_config and cache_config.block_size is None:
            # IPEX kernels use a larger default block size.
            cache_config.block_size = 128 if ipex_available else 16
        if not ipex_available and cache_config.block_size != 16:
            raise RuntimeError(
                f"--block-size={cache_config.block_size} requires"
                " intel_extension_for_pytorch")
        scheduler_config = vllm_config.scheduler_config
        if ((scheduler_config.chunked_prefill_enabled
             or cache_config.enable_prefix_caching)
                and cache_config.cache_dtype != "auto"):
            raise RuntimeError("Chunked-prefill and prefix-cache on the CPU "
                               "backend is not compatible with FP8 KV cache.")
        if cache_config.cache_dtype == "fp8_e4m3":
            cache_config.cache_dtype = "fp8_e5m2"
            logger.warning(
                "CPU backend doesn't support fp8_e4m3 KV cache type, "
                "cast to fp8_e5m2.")
        if (cache_config.cache_dtype != "auto"
                and model_config.dtype == torch.half):
            logger.warning("FP8 KV cache on the CPU backend only does not"
                           " support fp16 for now, cast to bf16.")
            model_config.dtype = torch.bfloat16
        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
        if kv_cache_space >= 0:
            if kv_cache_space == 0:
                # Unset env var: default to 4 GiB of KV cache.
                cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
                logger.warning(
                    "Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) "
                    "for CPU backend is not set, using 4 by default.")
            else:
                cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore # noqa
        else:
            raise RuntimeError(
                "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
                f" {kv_cache_space}, expect a positive integer value.")
        parallel_config = vllm_config.parallel_config
        if (parallel_config.world_size > 1
                and parallel_config.distributed_executor_backend is not None
                and parallel_config.distributed_executor_backend != "mp"):
            logger.warning(("%s is not supported on CPU, fallback to mp "
                            "distributed executor backend."),
                           parallel_config.distributed_executor_backend)
            parallel_config.distributed_executor_backend = "mp"
        if parallel_config.worker_cls == "auto":
            if vllm_config.speculative_config:
                parallel_config.worker_cls = \
                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
                parallel_config.sd_worker_cls = \
                    "vllm.worker.cpu_worker.CPUWorker"
            else:
                if envs.VLLM_USE_V1:
                    parallel_config.worker_cls = \
                        "vllm.v1.worker.cpu_worker.CPUWorker"
                else:
                    parallel_config.worker_cls = \
                        "vllm.worker.cpu_worker.CPUWorker"
        # Note: workaround for v1 gpu_model_runner
        from vllm.config import CompilationLevel
        vllm_config.compilation_config.cudagraph_capture_sizes = []
        compilation_config = vllm_config.compilation_config
        if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
                == CompilationLevel.PIECEWISE):
            # Piecewise CUDA-graph compilation does not apply on CPU; fall
            # back to one-shot Dynamo with the eager backend.
            compilation_config.level = CompilationLevel.DYNAMO_ONCE
            compilation_config.backend = "eager"
            compilation_config.custom_ops += ["none"]
            compilation_config.inductor_compile_config.update({
                "dce":
                True,
                "size_asserts":
                False,
                "nan_asserts":
                False,
                "memory_planning":
                True,
                "epilogue_fusion":
                True,
            })
        if vllm_config.lora_config is not None:
            compilation_config.level = CompilationLevel.NO_COMPILATION
        assert vllm_config.device_config.device_type == "cpu"
        #
        # Environment variables for CPU executor
        #
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        # Note: to avoid the error 'nthreads cannot be larger than environment
        # variable "NUMEXPR_MAX_THREADS" (64)'.
        os.environ["NUMEXPR_MAX_THREADS"] = str(get_max_threads())
        # Set default threads num for OpenMP parallel
        os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
        # Disable torch async compiling which won't work with daemonic processes
        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
        # Share the cpusets list among ranks by spawning process instead
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        # Intel OpenMP setting
        # NOTE(review): "ld_prealod_str" looks like a typo of
        # "ld_preload_str"; kept as-is in this documentation-only pass.
        ld_prealod_str = os.getenv("LD_PRELOAD", "")
        if "libiomp5.so" in ld_prealod_str:
            # The time(milliseconds) that a thread should wait after
            # completing the execution of a parallel region, before sleeping.
            os.environ['KMP_BLOCKTIME'] = "1"
            # Prevents the CPU to run into low performance state
            os.environ['KMP_TPAUSE'] = "0"
            # Provides fine granularity parallelism
            os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
            os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
            os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
        # To hint IPEX uses shared memory based AllReduce
        os.environ["LOCAL_WORLD_SIZE"] = str(
            vllm_config.parallel_config.tensor_parallel_size)
        if vllm_config.model_config and vllm_config.model_config.use_mla:
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked "
                "prefill and prefix caching to be disabled.")
            vllm_config.scheduler_config.enable_chunked_prefill = False
            vllm_config.scheduler_config.chunked_prefill_enabled = False
            vllm_config.scheduler_config.max_num_batched_tokens = max(
                vllm_config.scheduler_config.max_model_len,
                DEFAULT_MAX_NUM_BATCHED_TOKENS)

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        logger.warning("Pin memory is not supported on CPU.")
        return False

    @classmethod
    def get_punica_wrapper(cls) -> str:
        return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """
        Get device specific communicator class for distributed communication.
        """
        return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator"  # noqa

    @classmethod
    def supports_structured_output(cls) -> bool:
        return True

    @classmethod
    def supports_v1(cls, model_config) -> bool:
        """Returns whether the current platform can support v1 for the supplied
        model configuration.
        """
        return True

526
platforms/cuda.py Normal file
View File

@@ -0,0 +1,526 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Code inside this file can safely assume cuda platform, e.g. importing
pynvml. However, it should not initialize cuda context.
"""
import os
from datetime import timedelta
from functools import wraps
from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
import torch
from torch.distributed import PrefixStore, ProcessGroup
from torch.distributed.distributed_c10d import is_nccl_available
from typing_extensions import ParamSpec
# import custom ops, trigger op registration
import vllm._C # noqa
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.utils import import_pynvml
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig
logger = init_logger(__name__)
# ParamSpec/TypeVar used to type the with_nvml_context decorator below.
_P = ParamSpec("_P")
_R = TypeVar("_R")
# Resolved once at import time and shared by every NVML helper in this
# module.
pynvml = import_pynvml()
# pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models
# see https://github.com/huggingface/diffusers/issues/9704 for details
torch.backends.cuda.enable_cudnn_sdp(False)
def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
    """Decorator that runs *fn* inside an nvmlInit()/nvmlShutdown() pair."""

    @wraps(fn)
    def _nvml_wrapped(*args: _P.args, **kwargs: _P.kwargs) -> _R:
        pynvml.nvmlInit()
        try:
            return fn(*args, **kwargs)
        finally:
            # Always release the NVML handle, even if *fn* raises.
            pynvml.nvmlShutdown()

    return _nvml_wrapped
class CudaPlatformBase(Platform):
    """Shared implementation for CUDA platforms.

    Concrete subclasses (`NvmlCudaPlatform`, `NonNvmlCudaPlatform`) fill
    in the device-query methods left as ``NotImplementedError`` here.
    """

    _enum = PlatformEnum.CUDA
    device_name: str = "cuda"
    device_type: str = "cuda"
    dispatch_key: str = "CUDA"
    ray_device_key: str = "GPU"
    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"

    @property
    def supported_dtypes(self) -> list[torch.dtype]:
        """Dtypes supported by the detected GPU generation."""
        if self.has_device_capability(80):
            # Ampere and Hopper or later NVIDIA GPUs.
            return [torch.bfloat16, torch.float16, torch.float32]
        if self.has_device_capability(60):
            # Pascal, Volta and Turing NVIDIA GPUs, BF16 is not supported.
            # (Reaching here already implies capability < 80, so no extra
            # check is needed.)
            return [torch.float16, torch.float32]
        # Kepler and Maxwell NVIDIA GPUs, only FP32 is supported,
        # though vLLM doesn't support these GPUs.
        return [torch.float32]

    @classmethod
    def get_device_capability(cls,
                              device_id: int = 0
                              ) -> Optional[DeviceCapability]:
        raise NotImplementedError

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        raise NotImplementedError

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        raise NotImplementedError

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Async output processing requires CUDA graphs, so it is
        disabled when eager mode is enforced."""
        if enforce_eager:
            logger.warning(
                "To see benefits of async output processing, enable CUDA "
                "graph. Since, enforce-eager is enabled, async output "
                "processor cannot be used")
            return False
        return True

    @classmethod
    def is_fully_connected(cls, device_ids: list[int]) -> bool:
        raise NotImplementedError

    @classmethod
    def log_warnings(cls):
        pass

    @classmethod
    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """Validate and normalize *vllm_config* for CUDA in place.

        Selects the worker class, defaults the KV-cache block size
        (forcing 64 when FlashMLA is used), and disables CUDA graphs when
        the DeepEP high-throughput all-to-all backend is active.
        """
        parallel_config = vllm_config.parallel_config
        scheduler_config = vllm_config.scheduler_config
        model_config = vllm_config.model_config
        if parallel_config.worker_cls == "auto":
            if scheduler_config.is_multi_step:
                if envs.VLLM_USE_V1:
                    raise NotImplementedError(
                        "Multi-step scheduling is not supported (and not "
                        "needed) on vLLM V1. Please launch without "
                        "--num-scheduler-steps.")
                else:
                    parallel_config.worker_cls = \
                        "vllm.worker.multi_step_worker.MultiStepWorker"
            elif vllm_config.speculative_config:
                if envs.VLLM_USE_V1:
                    parallel_config.worker_cls = \
                        "vllm.v1.worker.gpu_worker.Worker"
                else:
                    parallel_config.worker_cls = \
                        "vllm.spec_decode.spec_decode_worker.create_spec_worker"
                    parallel_config.sd_worker_cls = \
                        "vllm.worker.worker.Worker"
            else:
                if envs.VLLM_USE_V1:
                    parallel_config.worker_cls = \
                        "vllm.v1.worker.gpu_worker.Worker"
                else:
                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16
        # TODO(lucas): handle this more gracefully
        # Note: model_config may be None during testing
        if model_config is not None and model_config.use_mla:
            # if `VLLM_ATTENTION_BACKEND` is not set and we are using MLA,
            # then we default to FlashMLA backend, so we need to force the
            # blocksize here
            use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None
                            or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
            from vllm.attention.ops.flashmla import is_flashmla_supported
            if use_flashmla and is_flashmla_supported()[0] \
                    and cache_config.block_size != 64:
                cache_config.block_size = 64
                logger.info(
                    "Forcing kv cache block size to 64 for FlashMLA backend.")
        if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
                and parallel_config.data_parallel_size > 1
                and vllm_config.compilation_config.use_cudagraph):
            logger.info(
                "Data Parallel: Forcing enforce eager to be True since DP "
                "with DeepEP high-throughput kernels are not CUDA Graph "
                "compatible. The DeepEP low-latency kernels are CUDA Graph "
                "compatible. Set the all_to_all backend to deepep_low_latency "
                "to use those kernels instead.")
            vllm_config.compilation_config.use_cudagraph = False
            vllm_config.model_config.enforce_eager = True
            # TODO (varun): Turning this ON gives incorrect results for the
            # Deepseek-V2-lite model.
            vllm_config.compilation_config.use_inductor = False

    @classmethod
    def get_current_memory_usage(cls,
                                 device: Optional[torch.types.Device] = None
                                 ) -> float:
        """Return the currently allocated CUDA memory (bytes) on *device*."""
        torch.cuda.empty_cache()
        # After resetting the peak stats, max_memory_allocated reflects
        # the memory allocated right now.
        torch.cuda.reset_peak_memory_stats(device)
        return torch.cuda.max_memory_allocated(device)

    @classmethod
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                             kv_cache_dtype, block_size, use_v1,
                             use_mla) -> str:
        """Select the attention backend class qualname for the given
        engine version (V0/V1) and model/KV-cache configuration."""
        if use_mla:
            # TODO(lucas): refactor to be more concise
            # we should probably consider factoring out V1 here
            if selected_backend == _Backend.CUTLASS_MLA_VLLM_V1:
                if use_v1:
                    logger.info_once("Using Cutlass MLA backend on V1 engine.")
                    return ("vllm.v1.attention.backends.mla."
                            "cutlass_mla.CutlassMLABackend")
                else:
                    logger.warning(
                        "Cutlass MLA backend is only supported on V1 engine")
            if selected_backend == _Backend.TRITON_MLA or block_size != 64:
                if use_v1:
                    logger.info_once("Using Triton MLA backend on V1 engine.")
                    return ("vllm.v1.attention.backends.mla."
                            "triton_mla.TritonMLABackend")
                else:
                    logger.info("Using Triton MLA backend.")
                    return "vllm.attention.backends.triton_mla.TritonMLABackend"
            else:
                from vllm.attention.backends.flashmla import (
                    is_flashmla_supported)
                if not is_flashmla_supported()[0]:
                    logger.warning(
                        "FlashMLA backend is not supported due to %s",
                        is_flashmla_supported()[1])
                elif block_size != 64:
                    logger.warning(
                        "FlashMLA backend is not supported for block size %d"
                        " (currently only supports block size 64).",
                        block_size)
                else:
                    if use_v1:
                        logger.info_once(
                            "Using FlashMLA backend on V1 engine.")
                        return ("vllm.v1.attention.backends.mla."
                                "flashmla.FlashMLABackend")
                    else:
                        logger.info("Using FlashMLA backend.")
                        return ("vllm.attention.backends."
                                "flashmla.FlashMLABackend")
        if use_v1:
            if selected_backend == _Backend.FLASHINFER:
                logger.info_once("Using FlashInfer backend on V1 engine.")
                return "vllm.v1.attention.backends.flashinfer.FlashInferBackend"
            if selected_backend == _Backend.FLEX_ATTENTION:
                logger.info("Using FlexAttention backend on V1 engine.")
                return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
            if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
                logger.info_once("Using Triton backend on V1 engine.")
                return ("vllm.v1.attention.backends."
                        "triton_attn.TritonAttentionBackend")
            if cls.is_device_capability(100):
                # Prefer FlashInfer for V1 on Blackwell GPUs if installed
                try:
                    import flashinfer  # noqa: F401
                    logger.info_once(
                        "Using FlashInfer backend on V1 engine by default for "
                        "Blackwell (SM 10.0) GPUs.")
                    return ("vllm.v1.attention.backends."
                            "flashinfer.FlashInferBackend")
                except ImportError:
                    logger.info_once(
                        "FlashInfer failed to import for V1 engine on "
                        "Blackwell (SM 10.0) GPUs; it is recommended to "
                        "install FlashInfer for better performance.")
            if cls.has_device_capability(80):
                logger.info_once("Using Flash Attention backend on V1 engine.")
                return ("vllm.v1.attention.backends."
                        "flash_attn.FlashAttentionBackend")
        if selected_backend == _Backend.FLASHINFER:
            logger.info("Using FlashInfer backend.")
            return "vllm.attention.backends.flashinfer.FlashInferBackend"
        elif selected_backend == _Backend.XFORMERS:
            logger.info("Using XFormers backend.")
            return "vllm.attention.backends.xformers.XFormersBackend"
        elif selected_backend == _Backend.DUAL_CHUNK_FLASH_ATTN:
            logger.info("Using DualChunkFlashAttention backend.")
            return ("vllm.attention.backends.dual_chunk_flash_attn."
                    "DualChunkFlashAttentionBackend")
        elif selected_backend == _Backend.FLASH_ATTN:
            pass
        elif selected_backend:
            raise ValueError(
                f"Invalid attention backend for {cls.device_name}, "
                f"with use_v1: {use_v1} use_mla: {use_mla}")
        target_backend = _Backend.FLASH_ATTN
        if not cls.has_device_capability(80):
            # Volta and Turing NVIDIA GPUs.
            logger.info(
                "Cannot use FlashAttention-2 backend for Volta and Turing "
                "GPUs.")
            target_backend = _Backend.XFORMERS
        elif dtype not in (torch.float16, torch.bfloat16):
            logger.info(
                "Cannot use FlashAttention-2 backend for dtype other than "
                "torch.float16 or torch.bfloat16.")
            target_backend = _Backend.XFORMERS
        elif block_size % 16 != 0:
            logger.info(
                "Cannot use FlashAttention-2 backend for block size not "
                "divisible by 16.")
            target_backend = _Backend.XFORMERS
        # FlashAttn is valid for the model, checking if the package is
        # installed.
        if target_backend == _Backend.FLASH_ATTN:
            try:
                import flash_attn  # noqa: F401
                from vllm.attention.backends.flash_attn import (  # noqa: F401
                    FlashAttentionBackend, flash_attn_supports_fp8)
                supported_sizes = \
                    FlashAttentionBackend.get_supported_head_sizes()
                if head_size not in supported_sizes:
                    logger.info(
                        "Cannot use FlashAttention-2 backend for head size %d.",
                        head_size)
                    target_backend = _Backend.XFORMERS
                fp8_kv_cache = (kv_cache_dtype is not None
                                and kv_cache_dtype.startswith("fp8"))
                if (fp8_kv_cache and not flash_attn_supports_fp8()):
                    logger.info(
                        "Cannot use FlashAttention backend for FP8 KV cache.")
                    logger.warning(
                        "Please use FlashInfer backend with FP8 KV Cache for "
                        "better performance by setting environment variable "
                        "VLLM_ATTENTION_BACKEND=FLASHINFER")
                    target_backend = _Backend.XFORMERS
            except ImportError:
                logger.info(
                    "Cannot use FlashAttention-2 backend because the "
                    "vllm.vllm_flash_attn package is not found. "
                    "Make sure that vllm_flash_attn was built and installed "
                    "(on by default).")
                target_backend = _Backend.XFORMERS
        if target_backend == _Backend.XFORMERS:
            logger.info("Using XFormers backend.")
            return "vllm.attention.backends.xformers.XFormersBackend"
        logger.info("Using Flash Attention backend.")
        return "vllm.attention.backends.flash_attn.FlashAttentionBackend"

    @classmethod
    def get_punica_wrapper(cls) -> str:
        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator"  # noqa

    @classmethod
    def supports_fp8(cls) -> bool:
        # FP8 requires compute capability 8.9 or newer.
        return cls.has_device_capability(89)

    @classmethod
    def supports_v1(cls, model_config: "ModelConfig") -> bool:
        return True

    @classmethod
    def use_custom_allreduce(cls) -> bool:
        return True

    @classmethod
    def get_piecewise_backend_cls(cls) -> str:
        return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa

    @classmethod
    def stateless_init_device_torch_dist_pg(
        cls,
        backend: str,
        prefix_store: PrefixStore,
        group_rank: int,
        group_size: int,
        timeout: timedelta,
    ) -> ProcessGroup:
        """Create an NCCL ProcessGroup without the global
        torch.distributed initialization, wiring an NCCL backend into a
        bare ProcessGroup built on *prefix_store*."""
        assert is_nccl_available()
        pg: ProcessGroup = ProcessGroup(
            prefix_store,
            group_rank,
            group_size,
        )
        from torch.distributed.distributed_c10d import ProcessGroupNCCL
        backend_options = ProcessGroupNCCL.Options()
        backend_options._timeout = timeout
        backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
                                         backend_options)
        backend_type = ProcessGroup.BackendType.NCCL
        device = torch.device("cuda")
        pg._set_default_backend(backend_type)
        backend_class._set_sequence_number_for_group()
        pg._register_backend(device, backend_type, backend_class)
        return pg
# NVML utils
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
# all the related functions work on real physical device ids.
# the major benefit of using NVML is that it will not initialize CUDA
class NvmlCudaPlatform(CudaPlatformBase):
    """CUDA platform that answers device queries through NVML.

    NVML does not initialize a CUDA context and ignores
    `CUDA_VISIBLE_DEVICES`: all helpers here work on physical device ids
    (translated via `device_id_to_physical_device_id`).
    """

    @classmethod
    @with_nvml_context
    def get_device_capability(cls,
                              device_id: int = 0
                              ) -> Optional[DeviceCapability]:
        # Returns None instead of raising when the capability query fails.
        try:
            physical_device_id = cls.device_id_to_physical_device_id(device_id)
            handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
            major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
            return DeviceCapability(major=major, minor=minor)
        except RuntimeError:
            return None

    @classmethod
    @with_nvml_context
    def has_device_capability(
        cls,
        capability: Union[tuple[int, int], int],
        device_id: int = 0,
    ) -> bool:
        # Delegates to the base implementation but degrades to False on
        # RuntimeError instead of propagating it.
        try:
            return super().has_device_capability(capability, device_id)
        except RuntimeError:
            return False

    @classmethod
    @with_nvml_context
    def get_device_name(cls, device_id: int = 0) -> str:
        physical_device_id = cls.device_id_to_physical_device_id(device_id)
        return cls._get_physical_device_name(physical_device_id)

    @classmethod
    @with_nvml_context
    def get_device_uuid(cls, device_id: int = 0) -> str:
        physical_device_id = cls.device_id_to_physical_device_id(device_id)
        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
        return pynvml.nvmlDeviceGetUUID(handle)

    @classmethod
    @with_nvml_context
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        # Total (not free) device memory, in bytes.
        physical_device_id = cls.device_id_to_physical_device_id(device_id)
        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
        return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)

    @classmethod
    @with_nvml_context
    def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
        """
        query if the set of gpus are fully connected by nvlink (1 hop)
        """
        handles = [
            pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids
        ]
        # Check NVLink P2P status for every unordered pair of devices.
        for i, handle in enumerate(handles):
            for j, peer_handle in enumerate(handles):
                if i < j:
                    try:
                        p2p_status = pynvml.nvmlDeviceGetP2PStatus(
                            handle,
                            peer_handle,
                            pynvml.NVML_P2P_CAPS_INDEX_NVLINK,
                        )
                        if p2p_status != pynvml.NVML_P2P_STATUS_OK:
                            return False
                    except pynvml.NVMLError:
                        logger.exception(
                            "NVLink detection failed. This is normal if"
                            " your machine has no NVLink equipped.")
                        return False
        return True

    @classmethod
    def _get_physical_device_name(cls, device_id: int = 0) -> str:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        return pynvml.nvmlDeviceGetName(handle)

    @classmethod
    @with_nvml_context
    def log_warnings(cls):
        # Warn when different GPU models are present and CUDA_DEVICE_ORDER
        # is not pinned to PCI_BUS_ID.
        device_ids: int = pynvml.nvmlDeviceGetCount()
        if device_ids > 1:
            device_names = [
                cls._get_physical_device_name(i) for i in range(device_ids)
            ]
            if (len(set(device_names)) > 1
                    and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
                logger.warning(
                    "Detected different devices in the system: %s. Please"
                    " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
                    "avoid unexpected behavior.",
                    ", ".join(device_names),
                )
class NonNvmlCudaPlatform(CudaPlatformBase):
    """CUDA platform that answers device queries via torch.cuda, used when
    NVML is unavailable."""

    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        capability = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=capability[0], minor=capability[1])

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(device_id)

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        return torch.cuda.get_device_properties(device_id).total_memory

    @classmethod
    def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
        # NOTE(review): logger.exception outside an `except` block logs with
        # no active traceback; presumably a warning was intended — confirm.
        logger.exception(
            "NVLink detection not possible, as context support was"
            " not found. Assuming no NVLink available.")
        return False
# Autodetect either NVML-enabled or non-NVML platform
# based on whether NVML is available.
nvml_available = False
try:
    pynvml.nvmlInit()
    nvml_available = True
except Exception:
    # On Jetson, NVML is not supported.
    nvml_available = False
if nvml_available:
    # Only shut down a session we actually opened.
    pynvml.nvmlShutdown()

CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform

CudaPlatform.log_warnings()

106
platforms/hpu.py Normal file
View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import TYPE_CHECKING, Optional
import torch
from vllm import envs
from vllm.logger import init_logger
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
from .interface import Platform, PlatformEnum, _Backend
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None
logger = init_logger(__name__)
class HpuPlatform(Platform):
    """Platform implementation for Intel Gaudi (HPU) accelerators."""
    _enum = PlatformEnum.HPU
    device_name: str = "hpu"
    device_type: str = "hpu"
    dispatch_key: str = "HPU"
    ray_device_key: str = "HPU"
    device_control_env_var: str = "HABANA_VISIBLE_MODULES"

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        # HPU always uses its dedicated attention backend; the selected
        # backend and the other arguments are ignored.
        logger.info("Using HPUAttention backend.")
        return "vllm.attention.backends.hpu_attn.HPUAttentionBackend"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Async output processing is supported on HPU."""
        return True

    @classmethod
    def inference_mode(cls):
        # HPU falls back to torch.no_grad() instead of torch.inference_mode().
        return torch.no_grad()

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Validate and mutate the vLLM config in place for HPU.

        Raises:
            NotImplementedError: if speculative decoding is configured.
        """
        scheduler_config = vllm_config.scheduler_config
        parallel_config = vllm_config.parallel_config
        if scheduler_config.is_multi_step:
            parallel_config.worker_cls = \
                "vllm.worker.multi_step_hpu_worker.MultiStepHPUWorker"

        if vllm_config.speculative_config is not None:
            raise NotImplementedError(
                "Speculative decoding is not implemented for HPU")

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"

        # NOTE(kzawora): default block size for Gaudi should be 128
        # smaller sizes still work, but very inefficiently
        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 128
        if (parallel_config.distributed_executor_backend == 'mp'
                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
            # Distinguish an explicitly exported env var from the vLLM
            # default: only override the multiproc method when the user did
            # not ask for "fork" themselves.
            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
                              None) is not None:
                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                               "might cause application hangs on exit. Using "
                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
                               "as it was explicitly requested.")
            else:
                logger.warning(
                    "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                    "might cause application hangs on exit. Setting "
                    "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
                    "To override that behavior, please set "
                    "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        if vllm_config.model_config and vllm_config.model_config.use_mla:
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked "
                "prefill and prefix caching to be disabled.")
            vllm_config.scheduler_config.enable_chunked_prefill = False
            vllm_config.scheduler_config.chunked_prefill_enabled = False
            vllm_config.scheduler_config.max_num_batched_tokens = max(
                vllm_config.scheduler_config.max_model_len,
                DEFAULT_MAX_NUM_BATCHED_TOKENS)

    @classmethod
    def is_pin_memory_available(cls):
        logger.warning("Pin memory is not supported on HPU.")
        return False

    @classmethod
    def get_punica_wrapper(cls) -> str:
        """Qualified name of the HPU-specific LoRA punica wrapper."""
        return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """Qualified name of the HPU distributed communicator class."""
        return "vllm.distributed.device_communicators.hpu_communicator.HpuCommunicator"  # noqa

538
platforms/interface.py Normal file
View File

@@ -0,0 +1,538 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
import os
import platform
import random
from datetime import timedelta
from platform import uname
from typing import TYPE_CHECKING, NamedTuple, Optional, Union
import numpy as np
import torch
from torch.distributed import PrefixStore, ProcessGroup
from vllm.inputs import ProcessorInputs, PromptType
from vllm.logger import init_logger
if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig
from vllm.lora.request import LoRARequest
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.utils import FlexibleArgumentParser
else:
ModelConfig = None
VllmConfig = None
LoRARequest = None
PoolingParams = None
SamplingParams = None
FlexibleArgumentParser = None
logger = init_logger(__name__)
def in_wsl() -> bool:
    """Detect Windows Subsystem for Linux via the kernel's uname fields.

    Reference: https://github.com/microsoft/WSL/issues/4071
    """
    system_info = " ".join(uname()).lower()
    return "microsoft" in system_info
class _Backend(enum.Enum):
    """Attention backend identifiers.

    NOTE: members use auto(), so their numeric values depend on definition
    order — do not reorder.
    """
    FLASH_ATTN = enum.auto()
    FLASH_ATTN_VLLM_V1 = enum.auto()
    TRITON_ATTN_VLLM_V1 = enum.auto()
    XFORMERS = enum.auto()
    ROCM_FLASH = enum.auto()
    ROCM_AITER_MLA = enum.auto()  # Supported by V1
    ROCM_AITER_MLA_VLLM_V1 = enum.auto()
    TORCH_SDPA = enum.auto()
    FLASHINFER = enum.auto()
    FLASHINFER_VLLM_V1 = enum.auto()
    TRITON_MLA = enum.auto()  # Supported by V1
    TRITON_MLA_VLLM_V1 = enum.auto()
    FLASHMLA_VLLM_V1 = enum.auto()
    FLASHMLA = enum.auto()  # Supported by V1
    CUTLASS_MLA_VLLM_V1 = enum.auto()
    HPU_ATTN = enum.auto()
    PALLAS = enum.auto()
    PALLAS_VLLM_V1 = enum.auto()
    IPEX = enum.auto()
    BLOCK_SPARSE_FLASH_ATTN = enum.auto()
    DUAL_CHUNK_FLASH_ATTN = enum.auto()
    NO_ATTENTION = enum.auto()
    FLEX_ATTENTION = enum.auto()
class PlatformEnum(enum.Enum):
    """Identifiers for the hardware platforms vLLM can run on."""
    CUDA = enum.auto()
    ROCM = enum.auto()
    TPU = enum.auto()
    HPU = enum.auto()
    XPU = enum.auto()
    CPU = enum.auto()
    NEURON = enum.auto()
    OOT = enum.auto()  # out-of-tree (plugin-provided) platform
    UNSPECIFIED = enum.auto()
class CpuArchEnum(enum.Enum):
    """Host CPU architecture families (see Platform.get_cpu_architecture)."""
    X86 = enum.auto()
    ARM = enum.auto()
    POWERPC = enum.auto()
    OTHER = enum.auto()  # recognized machine string, but not a known family
    UNKNOWN = enum.auto()  # platform.machine() returned an empty string
class DeviceCapability(NamedTuple):
    """A device compute capability as a comparable (major, minor) pair."""
    major: int
    minor: int

    def as_version_str(self) -> str:
        """Render the capability as ``"<major>.<minor>"``."""
        return "{}.{}".format(self.major, self.minor)

    def to_int(self) -> int:
        """
        Express device capability as an integer `<major><minor>`.
        It is assumed that the minor version is always a single digit.
        """
        assert 0 <= self.minor < 10
        return self.major * 10 + self.minor
class Platform:
    """Base class describing a hardware platform.

    Concrete platforms (CUDA, ROCm, TPU, HPU, ...) override the class
    attributes and classmethods below; most hooks here are defaults or
    abstract stubs.
    """
    _enum: PlatformEnum
    device_name: str
    device_type: str

    # available dispatch keys:
    # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
    # use "CPU" as a fallback for platforms not registered in PyTorch
    dispatch_key: str = "CPU"

    # available ray device keys:
    # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
    # empty string means the device does not support ray
    ray_device_key: str = ""

    # platform-agnostic way to specify the device control environment variable,
    # .e.g. CUDA_VISIBLE_DEVICES for CUDA.
    # hint: search for "get_visible_accelerator_ids_env_var" in
    # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa
    device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"

    # The torch.compile backend for compiling simple and
    # standalone functions. The default value is "inductor" to keep
    # the same behavior as PyTorch.
    # NOTE: for the forward part of the model, vLLM has another separate
    # compilation strategy.
    simple_compile_backend: str = "inductor"

    # Quantization methods supported on this platform; an empty list means
    # no restriction (see verify_quantization).
    supported_quantization: list[str] = []

    # Extra platform-specific environment variable names; consumed elsewhere.
    additional_env_vars: list[str] = []

    @property
    def supported_dtypes(self) -> list[torch.dtype]:
        """Returns the supported dtypes for the current platform."""
        # Be careful with the order of the dtypes. The first dtype will
        # be used as the default dtype fallback for the current platform,
        # when encountering unsupported dtypes in "auto" dtype.
        return [torch.bfloat16, torch.float16, torch.float32]

    def is_cuda(self) -> bool:
        """Return True if this platform is CUDA."""
        return self._enum == PlatformEnum.CUDA

    def is_rocm(self) -> bool:
        """Return True if this platform is ROCm."""
        return self._enum == PlatformEnum.ROCM

    def is_tpu(self) -> bool:
        """Return True if this platform is TPU."""
        return self._enum == PlatformEnum.TPU

    def is_hpu(self) -> bool:
        """Return True if this platform is HPU."""
        return self._enum == PlatformEnum.HPU

    def is_xpu(self) -> bool:
        """Return True if this platform is XPU."""
        return self._enum == PlatformEnum.XPU

    def is_cpu(self) -> bool:
        """Return True if this platform is CPU."""
        return self._enum == PlatformEnum.CPU

    def is_neuron(self) -> bool:
        """Return True if this platform is Neuron."""
        return self._enum == PlatformEnum.NEURON

    def is_out_of_tree(self) -> bool:
        """Return True if this is an out-of-tree (plugin) platform."""
        return self._enum == PlatformEnum.OOT

    def is_cuda_alike(self) -> bool:
        """Return True if this platform is CUDA-compatible (CUDA or ROCm)."""
        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)

    def is_sleep_mode_available(self) -> bool:
        """Sleep mode is only available on CUDA."""
        return self._enum == PlatformEnum.CUDA

    @classmethod
    def device_id_to_physical_device_id(cls, device_id: int):
        """Map a logical device id to a physical one via the platform's
        device-control env var (e.g. CUDA_VISIBLE_DEVICES).

        Raises:
            RuntimeError: if the env var is set to an empty string.
        """
        if cls.device_control_env_var in os.environ:
            device_ids = os.environ[cls.device_control_env_var].split(",")
            if device_ids == [""]:
                msg = (f"{cls.device_control_env_var} is set to empty string, "
                       "which means current platform support is disabled. If "
                       "you are using ray, please unset the environment "
                       f"variable `{cls.device_control_env_var}` inside the "
                       "worker/actor. Check "
                       "https://github.com/vllm-project/vllm/issues/8402 for "
                       "more information.")
                raise RuntimeError(msg)
            physical_device_id = device_ids[device_id]
            return int(physical_device_id)
        else:
            # Env var unset: logical and physical ids coincide.
            return device_id

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        """Get the attention backend class of a device."""
        return ""

    @classmethod
    def get_device_capability(
        cls,
        device_id: int = 0,
    ) -> Optional[DeviceCapability]:
        """Stateless version of [torch.cuda.get_device_capability][]."""
        return None

    @classmethod
    def has_device_capability(
        cls,
        capability: Union[tuple[int, int], int],
        device_id: int = 0,
    ) -> bool:
        """
        Test whether this platform is compatible with a device capability.

        The `capability` argument can either be:

        - A tuple `(major, minor)`.
        - An integer `<major><minor>`. (See
        [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
        """
        current_capability = cls.get_device_capability(device_id=device_id)
        if current_capability is None:
            return False

        if isinstance(capability, tuple):
            return current_capability >= capability

        return current_capability.to_int() >= capability

    @classmethod
    def is_device_capability(
        cls,
        capability: Union[tuple[int, int], int],
        device_id: int = 0,
    ) -> bool:
        """
        Test whether this platform has exactly the specified device capability.

        The `capability` argument can either be:

        - A tuple `(major, minor)`.
        - An integer `<major><minor>`. (See
        [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
        """
        current_capability = cls.get_device_capability(device_id=device_id)
        if current_capability is None:
            return False

        if isinstance(capability, tuple):
            return current_capability == capability

        return current_capability.to_int() == capability

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Get the name of a device."""
        raise NotImplementedError

    @classmethod
    def get_device_uuid(cls, device_id: int = 0) -> str:
        """Get the uuid of a device, e.g. the PCI bus ID."""
        raise NotImplementedError

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Get the total memory of a device in bytes."""
        raise NotImplementedError

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """
        Check if the current platform supports async output.
        """
        raise NotImplementedError

    @classmethod
    def inference_mode(cls):
        """A device-specific wrapper of `torch.inference_mode`.

        This wrapper is recommended because some hardware backends such as TPU
        do not support `torch.inference_mode`. In such a case, they will fall
        back to `torch.no_grad` by overriding this method.
        """
        return torch.inference_mode(mode=True)

    @classmethod
    def seed_everything(cls, seed: Optional[int] = None) -> None:
        """
        Set the seed of each random module.
        `torch.manual_seed` will set seed on all devices.

        Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
        """
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

    @classmethod
    def pre_register_and_update(cls,
                                parser: Optional[FlexibleArgumentParser] = None
                                ) -> None:
        """
        Do some pre-registration or update action for the current platform.

        This function is called before global VllmConfig is initialized or cli
        arguments are parsed. It's used for out-of-tree platforms to register or
        update the configuration.

        For example, the out-of-tree quantization config can be imported and
        registered here dynamically.
        """
        pass

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """
        Check and update the configuration for the current platform.

        It can raise an exception if the configuration is not compatible with
        the current platform, or it can update the configuration to make it
        compatible with the current platform.

        The config is passed by reference, so it can be modified in place.
        """
        pass

    @classmethod
    def verify_model_arch(cls, model_arch: str) -> None:
        """
        Verify whether the current platform supports the specified model
        architecture.

        - This will raise an Error or Warning based on the model support on
        the current platform.
        - By default all models are considered supported.
        """
        pass

    @classmethod
    def verify_quantization(cls, quant: str) -> None:
        """
        Verify whether the quantization is supported by the current platform.

        Raises:
            ValueError: if `supported_quantization` is non-empty and does not
                contain `quant`.
        """
        if cls.supported_quantization and \
                quant not in cls.supported_quantization:
            raise ValueError(
                f"{quant} quantization is currently not supported in "
                f"{cls.device_name}.")

    @classmethod
    def get_cpu_architecture(cls) -> CpuArchEnum:
        """
        Determine the CPU architecture of the current system.
        Returns CpuArchEnum indicating the architecture type.
        """
        machine = platform.machine().lower()

        if machine in ("x86_64", "amd64", "i386", "i686"):
            return CpuArchEnum.X86
        elif machine.startswith("arm") or machine.startswith("aarch"):
            return CpuArchEnum.ARM
        elif machine.startswith("ppc"):
            return CpuArchEnum.POWERPC

        return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        """Checks whether pin memory is available on the current platform."""
        if in_wsl():
            # Pinning memory in WSL is not supported.
            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
            logger.warning("Using 'pin_memory=False' as WSL is detected. "
                           "This may slow down the performance.")
            return False
        return True

    @classmethod
    def get_current_memory_usage(cls,
                                 device: Optional[torch.types.Device] = None
                                 ) -> float:
        """
        Return the memory usage in bytes.
        """
        raise NotImplementedError

    @classmethod
    def get_punica_wrapper(cls) -> str:
        """
        Return the punica wrapper for current platform.
        """
        raise NotImplementedError

    @classmethod
    def get_infinity_values(cls, dtype: torch.dtype) -> tuple[float, float]:
        """
        Return the platform specific values for (-inf, inf)
        """
        return float("-inf"), float("inf")

    @classmethod
    def can_update_inplace(cls) -> bool:
        """
        Checks if the platform allows inplace memory updates
        """
        return True

    @classmethod
    def get_lora_vocab_padding_size(cls) -> int:
        """
        Returns how much padding the LoRA logits need for kernels
        """
        return 256

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """
        Get device specific communicator class for distributed communication.
        """
        return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase"  # noqa

    @classmethod
    def supports_mx(cls) -> bool:
        """
        Returns whether the current platform supports MX types.
        """
        return False

    @classmethod
    def supports_fp8(cls) -> bool:
        """
        Returns whether the current platform supports FP8 types.
        """
        return False

    @classmethod
    def is_fp8_fnuz(cls) -> bool:
        """
        Returns whether the preferred FP8 type is FNUZ on the current platform.

        There are two representations of FP8, OCP FP8 and FNUZ FP8.
        The OCP specification can be found at https://tinyurl.com/b7jvwpft.
        The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.

        AMD's MI300 and MI325 have native hardware support for FNUZ. All other
        hardware has converged on the OCP FP8 standard.
        """
        return False

    @classmethod
    def fp8_dtype(cls) -> torch.dtype:
        """
        Returns the preferred FP8 type on the current platform.

        See the documentation for is_fp8_fnuz for details.
        """
        return torch.float8_e4m3fn

    @classmethod
    def use_all_gather(cls) -> bool:
        """
        Whether to use allgather in LogitsProcessor to gather the logits.
        """
        import vllm.envs as envs
        from vllm.config import get_current_vllm_config

        parallel_config = get_current_vllm_config().parallel_config
        return (envs.VLLM_USE_V1
                or parallel_config.distributed_executor_backend
                == "external_launcher")

    @classmethod
    def supports_v1(cls, model_config: ModelConfig) -> bool:
        """Returns whether the current platform can support v1 for the supplied
        model configuration.
        """
        return False

    @classmethod
    def use_custom_allreduce(cls) -> bool:
        """
        Returns if custom allreduce is supported on the current platform
        """
        return False

    @classmethod
    def validate_request(
        cls,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        processed_inputs: ProcessorInputs,
    ) -> None:
        """Raises if this request is unsupported on this platform"""

    def __getattr__(self, key: str):
        # Fall back to the corresponding torch device module (e.g.
        # torch.cuda for device_type "cuda"); unknown attributes resolve to
        # None with a warning rather than raising AttributeError.
        device = getattr(torch, self.device_type, None)
        if device is not None and hasattr(device, key):
            return getattr(device, key)
        else:
            logger.warning("Current platform %s does not have '%s'" \
            " attribute.", self.device_type, key)
            return None

    @classmethod
    def get_cu_count(cls, device_id: int = 0) -> int:
        """
        Returns the total number of compute units (CU) on single GPU.
        """
        raise NotImplementedError

    @classmethod
    def get_piecewise_backend_cls(cls) -> str:
        """
        Get piecewise backend class for piecewise graph.
        """
        return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend"  # noqa

    @classmethod
    def stateless_init_device_torch_dist_pg(
        cls,
        backend: str,
        prefix_store: PrefixStore,
        group_rank: int,
        group_size: int,
        timeout: timedelta,
    ) -> ProcessGroup:
        """
        Init platform-specific torch distributed process group.

        Raises:
            RuntimeError: by default; platforms that support this must
                override it.
        """
        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
class UnspecifiedPlatform(Platform):
    """Fallback platform used when no concrete platform was detected."""
    _enum = PlatformEnum.UNSPECIFIED
    device_type = ""

150
platforms/neuron.py Normal file
View File

@@ -0,0 +1,150 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
import os
from functools import lru_cache
from typing import TYPE_CHECKING, Optional
from vllm import envs
from vllm.logger import init_logger
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
from .interface import Platform, PlatformEnum
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None
logger = init_logger(__name__)
class NeuronFramework(enum.Enum):
    """Neuron execution frameworks; values match the strings accepted by the
    VLLM_NEURON_FRAMEWORK environment variable."""
    TRANSFORMERS_NEURONX = "transformers-neuronx"
    NEURONX_DISTRIBUTED_INFERENCE = "neuronx-distributed-inference"
class NeuronPlatform(Platform):
    """Platform implementation for AWS Neuron accelerators."""
    _enum = PlatformEnum.NEURON
    device_name: str = "neuron"
    device_type: str = "neuron"
    ray_device_key: str = "neuron_cores"
    supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"]
    device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """All Neuron devices report the generic name "neuron"."""
        return "neuron"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Async output processing is not supported on Neuron."""
        return False

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Mutate the vLLM config in place to satisfy Neuron constraints."""
        parallel_config = vllm_config.parallel_config
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                "vllm.worker.neuron_worker.NeuronWorker"

        if parallel_config.world_size > 1:
            parallel_config.distributed_executor_backend = "uni"

        if vllm_config.cache_config and vllm_config.model_config:
            # neuron needs block_size = max_model_len
            vllm_config.cache_config.block_size = \
                vllm_config.model_config.max_model_len  # type: ignore

        if vllm_config.model_config and vllm_config.model_config.use_mla:
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked "
                "prefill and prefix caching to be disabled.")
            vllm_config.scheduler_config.enable_chunked_prefill = False
            vllm_config.scheduler_config.chunked_prefill_enabled = False
            vllm_config.scheduler_config.max_num_batched_tokens = max(
                vllm_config.scheduler_config.max_model_len,
                DEFAULT_MAX_NUM_BATCHED_TOKENS)

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        logger.warning("Pin memory is not supported on Neuron.")
        return False

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """Neuron communicator on V1; base-class default otherwise."""
        if envs.VLLM_USE_V1:
            return "vllm.distributed.device_communicators.neuron_communicator.NeuronCommunicator"  # noqa
        else:
            return Platform.get_device_communicator_cls()

    @classmethod
    def use_all_gather(cls) -> bool:
        return True

    @classmethod
    @lru_cache
    def is_neuronx_distributed_inference(cls) -> bool:
        """True if the neuronx-distributed-inference package is importable."""
        try:
            import neuronx_distributed_inference
        except ImportError:
            neuronx_distributed_inference = None
        return neuronx_distributed_inference is not None

    @classmethod
    @lru_cache
    def is_transformers_neuronx(cls) -> bool:
        """True if the transformers-neuronx package is importable."""
        try:
            import transformers_neuronx
        except ImportError:
            transformers_neuronx = None
        return transformers_neuronx is not None

    def get_neuron_framework_to_use(self):
        """Return the specified framework if corresponding installations are
        available.

        If no framework is specified, use neuronx-distributed-inference by
        default.
        If that's unavailable, check and switch to transformers-neuronx.
        """
        if not self.is_neuron():
            raise AssertionError(
                f"Neuron Framework unavailable for platform: {self}")

        tnx_installed = self.is_transformers_neuronx()
        nxd_installed = self.is_neuronx_distributed_inference()

        specified_framework = os.environ.get("VLLM_NEURON_FRAMEWORK")
        tnx_framework = NeuronFramework.TRANSFORMERS_NEURONX.value
        nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE.value
        if specified_framework == tnx_framework and tnx_installed:
            # BUG FIX: this previously returned `self.TRANSFORMERS_NEURONX`,
            # which is not an attribute of the platform; Platform.__getattr__
            # would resolve it to None, silently dropping an explicitly
            # requested transformers-neuronx framework.
            return NeuronFramework.TRANSFORMERS_NEURONX

        if ((specified_framework == nxd_framework and nxd_installed)
                or (specified_framework is None and nxd_installed)):
            return NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE

        if specified_framework is None and tnx_installed:
            return NeuronFramework.TRANSFORMERS_NEURONX

        return None

    def use_neuronx_distributed(self):
        """
        Return True if the framework determined in get_neuron_framework_to_use()
        is NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE, False otherwise. This
        is used to select the Neuron model framework and framework-specific
        configuration to apply during model compilation.
        """
        nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE
        return self.get_neuron_framework_to_use() == nxd_framework

    def use_transformers_neuronx(self):
        """
        Return True if the framework determined in get_neuron_framework_to_use()
        is NeuronFramework.TRANSFORMERS_NEURONX, False otherwise. This is used
        to select the Neuron model framework and framework-specific
        configuration to apply during model compilation.
        """
        return self.get_neuron_framework_to_use(
        ) == NeuronFramework.TRANSFORMERS_NEURONX

435
platforms/rocm.py Normal file
View File

@@ -0,0 +1,435 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from datetime import timedelta
from functools import cache, lru_cache, wraps
from typing import TYPE_CHECKING, Optional
import torch
from torch.distributed import PrefixStore, ProcessGroup
from torch.distributed.distributed_c10d import is_nccl_available
import vllm.envs as envs
from vllm.logger import init_logger
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig
logger = init_logger(__name__)
try:
from amdsmi import (AmdSmiException, amdsmi_get_gpu_asic_info,
amdsmi_get_processor_handles, amdsmi_init,
amdsmi_shut_down, amdsmi_topo_get_link_type)
except ImportError as e:
logger.warning("Failed to import from amdsmi with %r", e)
try:
import vllm._C # noqa: F401
except ImportError as e:
logger.warning("Failed to import from vllm._C with %r", e)
# import custom ops, trigger op registration
try:
import vllm._rocm_C # noqa: F401
except ImportError as e:
logger.warning("Failed to import from vllm._rocm_C with %r", e)
# Models not supported by ROCm.
_ROCM_UNSUPPORTED_MODELS: list[str] = []

# Models partially supported by ROCm.
# Architecture -> Reason.
_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
                    "Triton flash attention. For half-precision SWA support, "
                    "please use CK flash attention by setting "
                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
_ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = {
    "Qwen2ForCausalLM":
    _ROCM_SWA_REASON,
    "MistralForCausalLM":
    _ROCM_SWA_REASON,
    "MixtralForCausalLM":
    _ROCM_SWA_REASON,
    "PaliGemmaForConditionalGeneration":
    ("ROCm flash attention does not yet "
     "fully support 32-bit precision on PaliGemma"),
    "Phi3VForCausalLM":
    ("ROCm Triton flash attention may run into compilation errors due to "
     "excessive use of shared memory. If this happens, disable Triton FA "
     "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
}

# PCI device id (as reported by amdsmi asic info) -> canonical device name;
# used by RocmPlatform.get_device_name to normalize amdsmi output.
_ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
    "0x74a0": "AMD_Instinct_MI300A",
    "0x74a1": "AMD_Instinct_MI300X",
    "0x74b5": "AMD_Instinct_MI300X",  # MI300X VF
    "0x74a5": "AMD_Instinct_MI325X",
    "0x74b9": "AMD_Instinct_MI325X",  # MI325X VF
    "0x74a9": "AMD_Instinct_MI300X_HF",
    "0x74bd": "AMD_Instinct_MI300X_HF",
}
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES``
if "HIP_VISIBLE_DEVICES" in os.environ:
    val = os.environ["HIP_VISIBLE_DEVICES"]
    if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
        # Both variables are set: they must agree, otherwise device
        # selection would be ambiguous.
        assert val == cuda_val
    else:
        # Mirror HIP_VISIBLE_DEVICES into CUDA_VISIBLE_DEVICES so code that
        # only consults the CUDA variable sees the same restriction.
        os.environ["CUDA_VISIBLE_DEVICES"] = val
# AMDSMI utils
# Note that NVML is not affected by `{CUDA/HIP}_VISIBLE_DEVICES`,
# all the related functions work on real physical device ids.
# the major benefit of using AMDSMI is that it will not initialize CUDA
def with_amdsmi_context(fn):
    """Decorator: run ``fn`` inside an amdsmi_init()/amdsmi_shut_down() pair."""

    @wraps(fn)
    def wrapped(*args, **kwargs):
        amdsmi_init()
        try:
            return fn(*args, **kwargs)
        finally:
            # Always release the amdsmi session, even on error.
            amdsmi_shut_down()

    return wrapped
@cache
def on_gfx1x() -> bool:
    """True when the current GPU architecture name contains gfx11/gfx12."""
    arch_name = torch.cuda.get_device_properties("cuda").gcnArchName
    return "gfx11" in arch_name or "gfx12" in arch_name
@cache
def on_mi3xx() -> bool:
    """True when the current GPU architecture name contains gfx942/gfx950."""
    arch_name = torch.cuda.get_device_properties("cuda").gcnArchName
    return "gfx942" in arch_name or "gfx950" in arch_name
@cache
def on_gfx9() -> bool:
    """True when the current GPU architecture name contains gfx90a/942/950."""
    arch_name = torch.cuda.get_device_properties("cuda").gcnArchName
    return ("gfx90a" in arch_name or "gfx942" in arch_name
            or "gfx950" in arch_name)
# NOTE(review): @cache keys on all arguments, including alibi_slopes — a
# torch.Tensor is hashed/compared by identity here, so distinct-but-equal
# tensors produce separate cache entries; confirm callers pass None or a
# long-lived tensor.
@cache
def use_rocm_custom_paged_attention(
        qtype: torch.dtype,
        head_size: int,
        block_size: int,
        gqa_ratio: int,
        max_seq_len: int,
        sliding_window: int,
        kv_cache_dtype: str,
        alibi_slopes: Optional[torch.Tensor] = None) -> bool:
    """Decide whether ROCm's custom paged-attention kernel can be used for
    the given attention configuration on the current GPU architecture."""

    GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
    ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
    ON_GFX11_GFX12 = any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"])

    # custom paged attn always supported on V0. On V1, requires sliding window
    # disabled due to observed numerical discrepancy.
    # NOTE(review): sliding_window is annotated int but compared against the
    # tuple (-1, -1) — presumably callers may pass a tuple; confirm.
    if ON_GFX9:
        return ((not envs.VLLM_USE_V1 or sliding_window == 0
                 or sliding_window == (-1, -1))
                and (qtype == torch.half or qtype == torch.bfloat16)
                and (head_size == 64 or head_size == 128)
                and (block_size == 16 or block_size == 32)
                and (gqa_ratio >= 1 and gqa_ratio <= 16)
                and max_seq_len <= 32768 and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
                and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
                         and envs.VLLM_ROCM_USE_AITER))
    else:
        # gfx11/gfx12 path: stricter shape limits, no ALiBi, auto kv-cache
        # dtype only.
        return (ON_GFX11_GFX12 and (not envs.VLLM_USE_V1 or sliding_window == 0
                                    or sliding_window == (-1, -1))
                and (qtype == torch.half or qtype == torch.bfloat16)
                and head_size == 128 and block_size == 16
                and (gqa_ratio >= 3 and gqa_ratio <= 16)
                and max_seq_len <= 32768 and alibi_slopes is None
                and kv_cache_dtype == "auto"
                and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
class RocmPlatform(Platform):
    """Platform implementation for AMD ROCm GPUs."""
    _enum = PlatformEnum.ROCM
    device_name: str = "rocm"
    # ROCm reuses PyTorch's "cuda" device type and dispatch key.
    device_type: str = "cuda"
    dispatch_key: str = "CUDA"
    ray_device_key: str = "GPU"
    # rocm shares the same device control env var as CUDA
    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"

    supported_quantization: list[str] = [
        "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
        "quark", "ptpc_fp8"
    ]
    @classmethod
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                             kv_cache_dtype, block_size, use_v1,
                             use_mla) -> str:
        """Resolve the qualified attention-backend class name for ROCm.

        Raises:
            ValueError: when an MLA backend is requested with an unsupported
                block size, or a non-MLA backend is requested while use_mla
                is set.
        """
        if use_mla:
            from vllm.attention.backends.rocm_aiter_mla import (
                is_aiter_mla_enabled)

            # Default MLA backend choice: AITER MLA when enabled or when
            # block_size == 1, else Triton MLA.
            if selected_backend is None:
                selected_backend = (_Backend.ROCM_AITER_MLA if
                                    is_aiter_mla_enabled() or block_size == 1
                                    else _Backend.TRITON_MLA)

            if selected_backend == _Backend.TRITON_MLA:
                # Triton MLA requires block_size != 1.
                if block_size != 1:
                    logger.info("Using Triton MLA backend.")
                    return "vllm.attention.backends.triton_mla.TritonMLABackend"  # noqa: E501
                else:
                    raise ValueError(
                        f" The selected backend, {selected_backend.name},"
                        f"does not support block size {block_size}.")
            elif selected_backend == _Backend.ROCM_AITER_MLA \
                or selected_backend == _Backend.ROCM_AITER_MLA_VLLM_V1:
                # AITER MLA only supports block_size == 1; pick the V1 or V0
                # implementation accordingly.
                if block_size == 1:
                    if use_v1:
                        logger.info("Using AITER MLA backend on V1 engine.")
                        return "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"  # noqa: E501
                    else:
                        logger.info("Using AITER MLA backend")
                        return "vllm.attention.backends.rocm_aiter_mla.AiterMLABackend"  # noqa: E501
                else:
                    raise ValueError(
                        f" The selected backend, {selected_backend.name},"
                        f"does not support block size {block_size}."
                        "(currently only supports block size 1)")
            else:
                raise ValueError(
                    f" The selected backend, {selected_backend.name},"
                    f"is not MLA type while requested for MLA backend.")

        # Non-MLA path: map FLASH_ATTN (or no selection) onto ROCm flash.
        if selected_backend is None or selected_backend == _Backend.FLASH_ATTN:
            selected_backend = _Backend.ROCM_FLASH

        # V1 engine always uses the Triton attention backend on ROCm.
        if envs.VLLM_USE_V1:
            logger.info("Using Triton Attention backend on V1 engine.")
            return ("vllm.v1.attention.backends."
                    "triton_attn.TritonAttentionBackend")
        if selected_backend == _Backend.ROCM_FLASH:
            if not cls.has_device_capability(90):
                # not Instinct series GPUs.
                logger.info("flash_attn is not supported on NAVI GPUs.")
        else:
            logger.info("%s is not supported in AMD GPUs.", selected_backend)
        logger.info("Using ROCmFlashAttention backend.")
        return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend"  # noqa: E501
@classmethod
@lru_cache(maxsize=8)
def get_device_capability(cls,
device_id: int = 0
) -> Optional[DeviceCapability]:
major, minor = torch.cuda.get_device_capability(device_id)
return DeviceCapability(major=major, minor=minor)
    @classmethod
    @with_amdsmi_context
    def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
        """
        Query if the set of gpus are fully connected by xgmi (1 hop)
        """
        handles = [
            amdsmi_get_processor_handles()[i] for i in physical_device_ids
        ]
        # Check every unordered pair once (i < j).
        for i, handle in enumerate(handles):
            for j, peer_handle in enumerate(handles):
                if i < j:
                    try:
                        link_type = amdsmi_topo_get_link_type(
                            handle, peer_handle)
                        # type is 2 for XGMI
                        if link_type["hops"] != 1 or link_type["type"] != 2:
                            return False
                    except AmdSmiException as error:
                        # Treat query failures as "not fully connected".
                        logger.error("AMD 1 hop XGMI detection failed.",
                                     exc_info=error)
                        return False
        return True
@classmethod
@with_amdsmi_context
@lru_cache(maxsize=8)
def get_device_name(cls, device_id: int = 0) -> str:
    """Return a human-readable name for the GPU at *device_id*."""
    physical_id = cls.device_id_to_physical_device_id(device_id)
    asic_info = amdsmi_get_gpu_asic_info(
        amdsmi_get_processor_handles()[physical_id])
    # Prefer the curated device-id -> name mapping; fall back to the
    # raw market name reported by amdsmi.
    return _ROCM_DEVICE_ID_NAME_MAP.get(asic_info["device_id"],
                                        asic_info["market_name"])
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
    """Total memory in bytes of the GPU at *device_id*."""
    return torch.cuda.get_device_properties(device_id).total_memory
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
    """Async output processing is available unless eager mode is forced."""
    if not enforce_eager:
        return True
    logger.warning(
        "To see benefits of async output processing, enable CUDA "
        "graph. Since, enforce-eager is enabled, async output "
        "processor cannot be used")
    return False
@classmethod
def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
    """Apply ROCm-specific defaults to a freshly built ``VllmConfig``.

    Sets the KV-cache block size when unset and, for an "auto" worker,
    picks the worker class based on multi-step scheduling, speculative
    decoding, and whether the V1 engine is enabled.
    """
    cache_config = vllm_config.cache_config
    if cache_config and cache_config.block_size is None:
        # Default KV-cache block size when the user did not choose one.
        cache_config.block_size = 16

    parallel_config = vllm_config.parallel_config
    scheduler_config = vllm_config.scheduler_config
    if parallel_config.worker_cls == "auto":
        if scheduler_config.is_multi_step:
            if envs.VLLM_USE_V1:
                raise NotImplementedError(
                    "Multi-step scheduling is not supported (and not "
                    "needed) on vLLM V1. Please launch without "
                    "--num-scheduler-steps.")
            else:
                parallel_config.worker_cls = \
                    "vllm.worker.multi_step_worker.MultiStepWorker"
        elif vllm_config.speculative_config:
            if envs.VLLM_USE_V1:
                raise NotImplementedError(
                    "Speculative decoding is not yet supported on vLLM V1."
                )
            else:
                parallel_config.worker_cls = \
                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
                # NOTE(review): sd_worker_cls appears to be the worker used
                # for the draft model — confirm against the spec-decode code.
                parallel_config.sd_worker_cls = \
                    "vllm.worker.worker.Worker"
        else:
            if envs.VLLM_USE_V1:
                parallel_config.worker_cls = \
                    "vllm.v1.worker.gpu_worker.Worker"
            else:
                parallel_config.worker_cls = "vllm.worker.worker.Worker"
@classmethod
def verify_model_arch(cls, model_arch: str) -> None:
    """Raise if *model_arch* is unsupported on ROCm; warn when partial."""
    if model_arch in _ROCM_UNSUPPORTED_MODELS:
        raise ValueError(f"Model architecture '{model_arch}' is not "
                         "supported by ROCm for now.")

    partial_support_msg = _ROCM_PARTIALLY_SUPPORTED_MODELS.get(model_arch)
    if partial_support_msg is not None:
        logger.warning(
            "Model architecture '%s' is partially "
            "supported by ROCm: %s", model_arch, partial_support_msg)
@classmethod
def verify_quantization(cls, quant: str) -> None:
    """Run the base checks, then force Triton AWQ kernels on ROCm."""
    super().verify_quantization(quant)
    if quant != "awq" or envs.VLLM_USE_TRITON_AWQ:
        return
    logger.warning(
        "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
        " is not set, enabling VLLM_USE_TRITON_AWQ.")
    envs.VLLM_USE_TRITON_AWQ = True
@classmethod
def get_punica_wrapper(cls) -> str:
    """Dotted path of the LoRA punica wrapper used on ROCm."""
    # ROCm shares the generic GPU punica implementation.
    wrapper_cls = ("vllm.lora.punica_wrapper."
                   "punica_gpu.PunicaWrapperGPU")
    return wrapper_cls
@classmethod
def get_current_memory_usage(cls,
                             device: Optional[torch.types.Device] = None
                             ) -> float:
    """Return the number of bytes currently in use on *device*.

    Also resets the peak-memory statistics so subsequent peak queries
    measure from this point onward.
    """
    torch.cuda.reset_peak_memory_stats(device)
    # Query (free, total) in a single call so both values come from the
    # same snapshot; the previous code called mem_get_info() twice, and
    # an allocation between the two calls could skew the result.
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    return total_bytes - free_bytes
@classmethod
def get_device_communicator_cls(cls) -> str:
    """Dotted path of the device communicator class (shared with CUDA)."""
    communicator_cls = ("vllm.distributed.device_communicators."
                        "cuda_communicator.CudaCommunicator")
    return communicator_cls
@classmethod
def supports_mx(cls) -> bool:
    """True when device 0's architecture supports MX formats."""
    # Only gfx95* architectures qualify; device 0 stands in for the
    # whole (assumed homogeneous) platform.
    return "gfx95" in torch.cuda.get_device_properties(0).gcnArchName
@classmethod
def supports_fp8(cls) -> bool:
    """True when device 0's architecture has FP8 support."""
    gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
    for fp8_arch in ('gfx94', 'gfx95', 'gfx12'):
        if fp8_arch in gcn_arch:
            return True
    return False
@classmethod
def is_fp8_fnuz(cls) -> bool:
    """True when the device uses the fnuz FP8 encoding (gfx94*)."""
    # only device 0 is checked, this assumes MI300 platforms are homogeneous
    gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
    return 'gfx94' in gcn_arch
@classmethod
def fp8_dtype(cls) -> torch.dtype:
if cls.is_fp8_fnuz():
return torch.float8_e4m3fnuz
else:
return torch.float8_e4m3fn
@classmethod
def supports_v1(cls, model_config: "ModelConfig") -> bool:
    """Whether the V1 engine can run this model on ROCm.

    V1 support on AMD GPUs is experimental but enabled for all models.
    """
    return True
@classmethod
def use_custom_allreduce(cls) -> bool:
    """Custom allreduce is enabled only on MI300-series parts."""
    gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
    return 'gfx94' in gcn_arch or 'gfx95' in gcn_arch
@classmethod
def get_cu_count(cls, device_id: int = 0) -> int:
    """Number of compute units on the GPU at *device_id*."""
    device_props = torch.cuda.get_device_properties(device_id)
    return device_props.multi_processor_count
@classmethod
def is_navi(cls) -> bool:
    """True when device 0 is a Navi-family (gfx1*) GPU."""
    gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
    return 'gfx1' in gcn_arch
@classmethod
def get_piecewise_backend_cls(cls) -> str:
    """Dotted path of the piecewise compilation backend."""
    # ROCm reuses the CUDA piecewise backend implementation.
    backend_cls = ("vllm.compilation."
                   "cuda_piecewise_backend.CUDAPiecewiseBackend")
    return backend_cls
@classmethod
def stateless_init_device_torch_dist_pg(
    cls,
    backend: str,
    prefix_store: PrefixStore,
    group_rank: int,
    group_size: int,
    timeout: timedelta,
) -> ProcessGroup:
    """Create an NCCL-backed ``ProcessGroup`` without touching the
    global torch.distributed state.

    Builds the ``ProcessGroup`` shell, attaches a ``ProcessGroupNCCL``
    backend configured with *timeout*, and registers it for the CUDA
    device. Uses private c10d APIs (``_set_default_backend``,
    ``_register_backend``), so it is sensitive to the torch version.
    """
    assert is_nccl_available()
    pg: ProcessGroup = ProcessGroup(
        prefix_store,
        group_rank,
        group_size,
    )

    from torch.distributed.distributed_c10d import ProcessGroupNCCL

    # Configure the NCCL backend directly instead of going through
    # torch.distributed.init_process_group, which mutates global state.
    backend_options = ProcessGroupNCCL.Options()
    backend_options._timeout = timeout

    backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
                                     backend_options)
    backend_type = ProcessGroup.BackendType.NCCL
    device = torch.device("cuda")
    pg._set_default_backend(backend_type)
    backend_class._set_sequence_number_for_group()

    pg._register_backend(device, backend_type, backend_class)
    return pg

216
platforms/tpu.py Normal file
View File

@@ -0,0 +1,216 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Optional, Union, cast
import torch
from tpu_info import device
import vllm.envs as envs
from vllm.inputs import ProcessorInputs, PromptType
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
from .interface import Platform, PlatformEnum, _Backend
if TYPE_CHECKING:
from vllm.config import BlockSize, ModelConfig, VllmConfig
from vllm.pooling_params import PoolingParams
else:
BlockSize = None
ModelConfig = None
VllmConfig = None
PoolingParams = None
logger = init_logger(__name__)
class TpuPlatform(Platform):
    """Platform implementation for Google Cloud TPUs.

    Attention runs through the Pallas backends and compilation goes
    through torch.compile's ``openxla`` backend.
    """
    _enum = PlatformEnum.TPU
    device_name: str = "tpu"
    device_type: str = "tpu"
    # PyTorch dispatch key used for TPU tensors.
    dispatch_key: str = "XLA"
    # Resource name Ray uses to schedule onto TPU hosts.
    ray_device_key: str = "TPU"
    # Env var that restricts which TPU chips a process may use.
    device_control_env_var: str = "TPU_VISIBLE_CHIPS"

    simple_compile_backend: str = "openxla"

    supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]

    additional_env_vars: list[str] = [
        "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
    ]

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        """Return the dotted path of the attention backend class.

        Only the Pallas backends exist on TPU; any other selection is
        logged and then overridden.
        """
        if (selected_backend != _Backend.PALLAS
                and selected_backend != _Backend.PALLAS_VLLM_V1):
            logger.info("Cannot use %s backend on TPU.", selected_backend)

        if use_v1:
            logger.info("Using Pallas V1 backend.")
            return "vllm.v1.attention.backends.pallas.PallasAttentionBackend"
        else:
            logger.info("Using Pallas backend.")
            return "vllm.attention.backends.pallas.PallasAttentionBackend"

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Human-readable name of the local TPU chip type."""
        chip_type, _ = device.get_local_chips()
        return f"TPU {chip_type.name}"

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Not implemented for TPU."""
        raise NotImplementedError

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        # The V0-style async output processor is only usable on V0.
        return not envs.VLLM_USE_V1

    @classmethod
    def get_punica_wrapper(cls) -> str:
        """Dotted path of the TPU LoRA punica wrapper."""
        return "vllm.lora.punica_wrapper.punica_tpu.PunicaWrapperTPU"

    @classmethod
    def get_infinity_values(cls, dtype: torch.dtype) -> tuple[float, float]:
        # Finite extremes of the dtype instead of true +/-inf.
        # NOTE(review): presumably chosen for XLA numerical safety — confirm.
        return torch.finfo(dtype).min, torch.finfo(dtype).max

    @classmethod
    def can_update_inplace(cls):
        # In-place (LoRA) weight updates are disabled on TPU.
        return False

    @classmethod
    def get_lora_vocab_padding_size(cls) -> int:
        # No extra vocab padding needed for LoRA on TPU.
        return 1

    @classmethod
    def inference_mode(cls):
        """Context manager wrapped around model execution (``no_grad``)."""
        return torch.no_grad()

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Apply TPU-specific defaults and constraints to *vllm_config*.

        Forces DYNAMO_ONCE compilation with the openxla backend and
        bfloat16 dtype, derives the KV-cache page size on V1, selects the
        TPU worker classes, and rejects configurations TPU cannot run
        (speculative decoding, multi-step scheduling on V1).
        """
        from vllm.config import CompilationLevel

        cache_config = vllm_config.cache_config
        # For v0, the default block size is 16.
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = cast(BlockSize, 16)
        compilation_config = vllm_config.compilation_config

        # TPU only supports DYNAMO_ONCE compilation level
        if compilation_config.level != CompilationLevel.DYNAMO_ONCE:
            logger.info("[TPU] Forcing DYNAMO_ONCE compilation level")
            compilation_config.level = CompilationLevel.DYNAMO_ONCE

        if compilation_config.backend == "":
            compilation_config.backend = "openxla"

        assert vllm_config.speculative_config is None, \
            "TPU does not support speculative decoding"

        if vllm_config.model_config.dtype in (torch.float16, torch.float32):
            logger.warning(
                "The TPU backend currently does not support %s. "
                "Using bfloat16 instead.", vllm_config.model_config.dtype)
            vllm_config.model_config.dtype = torch.bfloat16

        if envs.VLLM_USE_V1:
            from vllm.v1.attention.backends.pallas import (
                PallasAttentionBackend)
            # Derive the page size from the model, then enforce the
            # minimum size needed to avoid SMEM out-of-memory.
            cache_config.block_size = PallasAttentionBackend.get_page_size(
                vllm_config)  # type: ignore[assignment]
            min_page_size = PallasAttentionBackend.get_min_page_size(
                vllm_config)
            if min_page_size > cache_config.block_size:
                logger.warning(
                    "Increase the page size from %s to %s to make sure there's"
                    "no SMEM OOM",
                    cache_config.block_size,
                    min_page_size,
                )
                cache_config.block_size = min_page_size  # type: ignore[assignment]

        parallel_config = vllm_config.parallel_config
        scheduler_config = vllm_config.scheduler_config
        if parallel_config.worker_cls == "auto":
            if scheduler_config.is_multi_step:
                if envs.VLLM_USE_V1:
                    raise NotImplementedError(
                        "Multi-step scheduling is not supported (and not "
                        "needed) on vLLM V1. Please launch without "
                        "--num-scheduler-steps.")
                else:
                    parallel_config.worker_cls = \
                        "vllm.worker.multi_step_tpu_worker.MultiStepTPUWorker"
            else:
                if envs.VLLM_USE_V1:
                    parallel_config.worker_cls = \
                        "vllm.v1.worker.tpu_worker.TPUWorker"
                else:
                    parallel_config.worker_cls = \
                        "vllm.worker.tpu_worker.TPUWorker"

        # NOTE(review): duplicates the speculative_config assert above;
        # one of the two checks is redundant.
        assert not vllm_config.speculative_config, (
            "Speculative decoding is not yet supported for TPU backend")

        if scheduler_config.is_multimodal_model and not \
            scheduler_config.disable_chunked_mm_input:
            logger.warning("TPU does not support running Multimodal models"\
                " without setting `--disable_chunked_mm_input`. " \
                "Forcing --disable_chunked_mm_input.")
            scheduler_config.disable_chunked_mm_input = True

        if vllm_config.model_config and vllm_config.model_config.use_mla:
            # MLA needs chunked prefill and prefix caching disabled on
            # non-GPU platforms.
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked "
                "prefill and prefix caching to be disabled.")
            vllm_config.scheduler_config.enable_chunked_prefill = False
            vllm_config.scheduler_config.chunked_prefill_enabled = False
            vllm_config.scheduler_config.max_num_batched_tokens = max(
                vllm_config.scheduler_config.max_model_len,
                DEFAULT_MAX_NUM_BATCHED_TOKENS)

    @classmethod
    def is_pin_memory_available(cls):
        """Pinned host memory is not supported on TPU."""
        logger.warning("Pin memory is not supported on TPU.")
        return False

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """Dotted path of the TPU device-communicator implementation."""
        return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator"  # noqa

    @classmethod
    def use_all_gather(cls) -> bool:
        # NOTE(review): presumably the sampler gathers logits via
        # all-gather on TPU — confirm against the base class contract.
        return True

    @classmethod
    def supports_v1(cls, model_config: ModelConfig) -> bool:
        # V1 support on TPU is experimental
        return True

    @classmethod
    def validate_request(
        cls,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        processed_inputs: ProcessorInputs,
    ) -> None:
        """Raises if this request is unsupported on this platform"""
        if isinstance(params, SamplingParams):
            # Structured (guided) output requires the V1 engine on TPU.
            if params.guided_decoding is not None and not envs.VLLM_USE_V1:
                raise ValueError("Structured output is not supported on "
                                 f"{cls.device_name} V0.")
            if params.sampling_type == SamplingType.RANDOM_SEED:
                raise ValueError(
                    "Torch XLA does not support per-request seed.")
# Prefer the tpu_commons implementation when that package is installed;
# otherwise keep the TpuPlatform defined above.
try:
    from tpu_commons.platforms import TpuPlatform as TpuCommonsPlatform
    TpuPlatform = TpuCommonsPlatform  # type: ignore
except ImportError:
    logger.info("tpu_commons not found, using vLLM's TpuPlatform")

156
platforms/xpu.py Normal file
View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Optional
import torch
from vllm.logger import init_logger
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None
logger = init_logger(__name__)
class XPUPlatform(Platform):
_enum = PlatformEnum.XPU
device_name: str = "xpu"
device_type: str = "xpu"
dispatch_key: str = "XPU"
# Intel XPU's device key is "GPU" for Ray.
# see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
ray_device_key: str = "GPU"
device_control_env_var: str = "ONEAPI_DEVICE_SELECTOR"
@classmethod
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                         dtype: torch.dtype, kv_cache_dtype: Optional[str],
                         block_size: int, use_v1: bool,
                         use_mla: bool) -> str:
    """Return the dotted path of the attention backend class.

    IPEX is the only attention backend on XPU; any other selection is
    logged and then overridden.
    """
    if selected_backend != _Backend.IPEX:
        logger.info("Cannot use %s backend on XPU.", selected_backend)
    logger.info("Using IPEX attention backend.")
    return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
@classmethod
def get_device_capability(
    cls,
    device_id: int = 0,
) -> Optional[DeviceCapability]:
    """Always return ``None`` on XPU.

    The capability format the XPU runtime reports differs from CUDA's
    and would trip capability comparisons, so no value is exposed.
    """
    return None
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
    """Name of the XPU device at *device_id* as reported by the runtime."""
    device_name: str = torch.xpu.get_device_name(device_id)
    return device_name
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
    """Total memory in bytes of the XPU device at *device_id*."""
    return torch.xpu.get_device_properties(device_id).total_memory
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
    """Async output processing is always available on XPU."""
    return True
@classmethod
def inference_mode(cls):
    """Context manager wrapped around model execution on XPU.

    NOTE(review): uses ``no_grad`` rather than ``torch.inference_mode``;
    presumably the latter is unsupported on XPU — confirm.
    """
    return torch.no_grad()
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
    """Apply XPU-specific defaults and constraints to *vllm_config*.

    Sets the KV-cache block size, falls back from bfloat16 to float16
    on unsupported parts, forces eager mode, rejects speculative
    decoding, selects the XPU worker, and forces the Ray distributed
    executor backend.
    """
    cache_config = vllm_config.cache_config
    if cache_config and cache_config.block_size is None:
        # Default KV-cache block size when the user did not choose one.
        cache_config.block_size = 16

    # check and update model config
    model_config = vllm_config.model_config
    if model_config.dtype == torch.bfloat16:
        bf16_supported = cls.device_support_bf16()
        if not bf16_supported:
            logger.warning(
                "bfloat16 is only supported on Intel Data Center GPU, "
                "Intel Arc GPU is not supported yet. Your device is %s,"
                " which is not supported. will fallback to float16",
                cls.get_device_name())
            model_config.dtype = torch.float16
    if not model_config.enforce_eager:
        logger.warning(
            "CUDA graph is not supported on XPU, fallback to the eager "
            "mode.")
        model_config.enforce_eager = True

    if vllm_config.speculative_config is not None:
        raise NotImplementedError(
            "XPU does not support speculative decoding")

    if vllm_config.device_config is not None:
        assert vllm_config.device_config.device_type == "xpu"

    # check and update parallel config
    parallel_config = vllm_config.parallel_config
    if parallel_config.worker_cls == "auto":
        parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"

    if parallel_config.distributed_executor_backend is None:
        # Ray is the only executor backend that works on XPU.
        parallel_config.distributed_executor_backend = "ray"
    elif parallel_config.distributed_executor_backend == "mp":
        # FIXME(kunshang):
        # spawn needs calling `if __name__ == '__main__':``
        # fork is not supported for xpu start new process.
        logger.error(
            "Both start methods (spawn and fork) have issue "
            "on XPU if you use mp backend, setting it to ray instead.")
        parallel_config.distributed_executor_backend = "ray"
    elif parallel_config.distributed_executor_backend != "ray":
        logger.warning(
            "%s is not supported on XPU, fallback to ray distributed"
            " executor backend.",
            parallel_config.distributed_executor_backend)
        parallel_config.distributed_executor_backend = "ray"

    if vllm_config.model_config and vllm_config.model_config.use_mla:
        # MLA needs chunked prefill and prefix caching disabled on
        # non-GPU platforms.
        logger.info(
            "MLA is enabled on a non-GPU platform; forcing chunked "
            "prefill and prefix caching to be disabled.")
        vllm_config.scheduler_config.enable_chunked_prefill = False
        vllm_config.scheduler_config.chunked_prefill_enabled = False
        vllm_config.scheduler_config.max_num_batched_tokens = max(
            vllm_config.scheduler_config.max_model_len,
            DEFAULT_MAX_NUM_BATCHED_TOKENS)
@classmethod
def is_pin_memory_available(cls):
    """Pinned host memory is not supported on XPU."""
    logger.warning("Pin memory is not supported on XPU.")
    return False
@classmethod
def get_current_memory_usage(cls,
                             device: Optional[torch.types.Device] = None
                             ) -> float:
    """Bytes currently allocated on *device*; peak counters are reset."""
    # After resetting the peak counters, the max equals the current
    # allocation, which is the value callers want here.
    torch.xpu.reset_peak_memory_stats(device)
    current_usage = torch.xpu.max_memory_allocated(device)
    return current_usage
@classmethod
def device_support_bf16(cls) -> bool:
    """Whether the current XPU device supports bfloat16 compute."""
    device_name = cls.get_device_name().lower()
    if "arc" in device_name:
        # Intel Arc (client) GPUs do not support bf16 yet.
        return False
    if "data center gpu" in device_name:
        return True
    logger.warning("Unknown device name %s, always use float16",
                   device_name)
    return False
@classmethod
def get_device_communicator_cls(cls) -> str:
    """Dotted path of the XPU device-communicator implementation."""
    communicator_cls = ("vllm.distributed.device_communicators."
                        "xpu_communicator.XpuCommunicator")
    return communicator_cls