init
This commit is contained in:
291
platforms/__init__.py
Normal file
291
platforms/__init__.py
Normal file
@@ -0,0 +1,291 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import logging
|
||||
import traceback
|
||||
from itertools import chain
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from vllm.plugins import load_plugins_by_group
|
||||
from vllm.utils import resolve_obj_by_qualname
|
||||
|
||||
from .interface import _Backend # noqa: F401
|
||||
from .interface import CpuArchEnum, Platform, PlatformEnum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def vllm_version_matches_substr(substr: str) -> bool:
|
||||
"""
|
||||
Check to see if the vLLM version matches a substring.
|
||||
"""
|
||||
from importlib.metadata import PackageNotFoundError, version
|
||||
try:
|
||||
vllm_version = version("vllm")
|
||||
except PackageNotFoundError as e:
|
||||
logger.warning(
|
||||
"The vLLM package was not found, so its version could not be "
|
||||
"inspected. This may cause platform detection to fail.")
|
||||
raise e
|
||||
return substr in vllm_version
|
||||
|
||||
|
||||
def tpu_platform_plugin() -> Optional[str]:
|
||||
is_tpu = False
|
||||
logger.debug("Checking if TPU platform is available.")
|
||||
try:
|
||||
# While it's technically possible to install libtpu on a
|
||||
# non-TPU machine, this is a very uncommon scenario. Therefore,
|
||||
# we assume that libtpu is installed if and only if the machine
|
||||
# has TPUs.
|
||||
import libtpu # noqa: F401
|
||||
is_tpu = True
|
||||
logger.debug("Confirmed TPU platform is available.")
|
||||
except Exception as e:
|
||||
logger.debug("TPU platform is not available because: %s", str(e))
|
||||
|
||||
return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None
|
||||
|
||||
|
||||
def cuda_platform_plugin() -> Optional[str]:
|
||||
is_cuda = False
|
||||
logger.debug("Checking if CUDA platform is available.")
|
||||
try:
|
||||
from vllm.utils import import_pynvml
|
||||
pynvml = import_pynvml()
|
||||
pynvml.nvmlInit()
|
||||
try:
|
||||
# NOTE: Edge case: vllm cpu build on a GPU machine.
|
||||
# Third-party pynvml can be imported in cpu build,
|
||||
# we need to check if vllm is built with cpu too.
|
||||
# Otherwise, vllm will always activate cuda plugin
|
||||
# on a GPU machine, even if in a cpu build.
|
||||
is_cuda = (pynvml.nvmlDeviceGetCount() > 0
|
||||
and not vllm_version_matches_substr("cpu"))
|
||||
if pynvml.nvmlDeviceGetCount() <= 0:
|
||||
logger.debug(
|
||||
"CUDA platform is not available because no GPU is found.")
|
||||
if vllm_version_matches_substr("cpu"):
|
||||
logger.debug("CUDA platform is not available because"
|
||||
" vLLM is built with CPU.")
|
||||
if is_cuda:
|
||||
logger.debug("Confirmed CUDA platform is available.")
|
||||
finally:
|
||||
pynvml.nvmlShutdown()
|
||||
except Exception as e:
|
||||
logger.debug("Exception happens when checking CUDA platform: %s",
|
||||
str(e))
|
||||
if "nvml" not in e.__class__.__name__.lower():
|
||||
# If the error is not related to NVML, re-raise it.
|
||||
raise e
|
||||
|
||||
# CUDA is supported on Jetson, but NVML may not be.
|
||||
import os
|
||||
|
||||
def cuda_is_jetson() -> bool:
|
||||
return os.path.isfile("/etc/nv_tegra_release") \
|
||||
or os.path.exists("/sys/class/tegra-firmware")
|
||||
|
||||
if cuda_is_jetson():
|
||||
logger.debug("Confirmed CUDA platform is available on Jetson.")
|
||||
is_cuda = True
|
||||
else:
|
||||
logger.debug("CUDA platform is not available because: %s", str(e))
|
||||
|
||||
return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None
|
||||
|
||||
|
||||
def rocm_platform_plugin() -> Optional[str]:
|
||||
is_rocm = False
|
||||
logger.debug("Checking if ROCm platform is available.")
|
||||
try:
|
||||
import amdsmi
|
||||
amdsmi.amdsmi_init()
|
||||
try:
|
||||
if len(amdsmi.amdsmi_get_processor_handles()) > 0:
|
||||
is_rocm = True
|
||||
logger.debug("Confirmed ROCm platform is available.")
|
||||
else:
|
||||
logger.debug("ROCm platform is not available because"
|
||||
" no GPU is found.")
|
||||
finally:
|
||||
amdsmi.amdsmi_shut_down()
|
||||
except Exception as e:
|
||||
logger.debug("ROCm platform is not available because: %s", str(e))
|
||||
|
||||
return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
|
||||
|
||||
|
||||
def hpu_platform_plugin() -> Optional[str]:
|
||||
is_hpu = False
|
||||
logger.debug("Checking if HPU platform is available.")
|
||||
try:
|
||||
from importlib import util
|
||||
is_hpu = util.find_spec('habana_frameworks') is not None
|
||||
if is_hpu:
|
||||
logger.debug("Confirmed HPU platform is available.")
|
||||
else:
|
||||
logger.debug("HPU platform is not available because "
|
||||
"habana_frameworks is not found.")
|
||||
except Exception as e:
|
||||
logger.debug("HPU platform is not available because: %s", str(e))
|
||||
|
||||
return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None
|
||||
|
||||
|
||||
def xpu_platform_plugin() -> Optional[str]:
|
||||
is_xpu = False
|
||||
logger.debug("Checking if XPU platform is available.")
|
||||
try:
|
||||
# installed IPEX if the machine has XPUs.
|
||||
import intel_extension_for_pytorch # noqa: F401
|
||||
import oneccl_bindings_for_pytorch # noqa: F401
|
||||
import torch
|
||||
if hasattr(torch, 'xpu') and torch.xpu.is_available():
|
||||
is_xpu = True
|
||||
logger.debug("Confirmed XPU platform is available.")
|
||||
except Exception as e:
|
||||
logger.debug("XPU platform is not available because: %s", str(e))
|
||||
|
||||
return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
|
||||
|
||||
|
||||
def cpu_platform_plugin() -> Optional[str]:
|
||||
is_cpu = False
|
||||
logger.debug("Checking if CPU platform is available.")
|
||||
try:
|
||||
is_cpu = vllm_version_matches_substr("cpu")
|
||||
if is_cpu:
|
||||
logger.debug("Confirmed CPU platform is available because"
|
||||
" vLLM is built with CPU.")
|
||||
if not is_cpu:
|
||||
import sys
|
||||
is_cpu = sys.platform.startswith("darwin")
|
||||
if is_cpu:
|
||||
logger.debug("Confirmed CPU platform is available"
|
||||
" because the machine is MacOS.")
|
||||
|
||||
except Exception as e:
|
||||
logger.debug("CPU platform is not available because: %s", str(e))
|
||||
|
||||
return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None
|
||||
|
||||
|
||||
def neuron_platform_plugin() -> Optional[str]:
|
||||
tnx_installed = False
|
||||
nxd_installed = False
|
||||
logger.debug("Checking if Neuron platform is available.")
|
||||
try:
|
||||
import transformers_neuronx # noqa: F401
|
||||
tnx_installed = True
|
||||
logger.debug("Confirmed Neuron platform is available because"
|
||||
" transformers_neuronx is found.")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import neuronx_distributed_inference # noqa: F401
|
||||
nxd_installed = True
|
||||
logger.debug("Confirmed Neuron platform is available because"
|
||||
" neuronx_distributed_inference is found.")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
is_neuron = tnx_installed or nxd_installed
|
||||
return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None
|
||||
|
||||
|
||||
builtin_platform_plugins = {
|
||||
'tpu': tpu_platform_plugin,
|
||||
'cuda': cuda_platform_plugin,
|
||||
'rocm': rocm_platform_plugin,
|
||||
'hpu': hpu_platform_plugin,
|
||||
'xpu': xpu_platform_plugin,
|
||||
'cpu': cpu_platform_plugin,
|
||||
'neuron': neuron_platform_plugin,
|
||||
}
|
||||
|
||||
|
||||
def resolve_current_platform_cls_qualname() -> str:
|
||||
platform_plugins = load_plugins_by_group('vllm.platform_plugins')
|
||||
|
||||
activated_plugins = []
|
||||
|
||||
for name, func in chain(builtin_platform_plugins.items(),
|
||||
platform_plugins.items()):
|
||||
try:
|
||||
assert callable(func)
|
||||
platform_cls_qualname = func()
|
||||
if platform_cls_qualname is not None:
|
||||
activated_plugins.append(name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
activated_builtin_plugins = list(
|
||||
set(activated_plugins) & set(builtin_platform_plugins.keys()))
|
||||
activated_oot_plugins = list(
|
||||
set(activated_plugins) & set(platform_plugins.keys()))
|
||||
|
||||
if len(activated_oot_plugins) >= 2:
|
||||
raise RuntimeError(
|
||||
"Only one platform plugin can be activated, but got: "
|
||||
f"{activated_oot_plugins}")
|
||||
elif len(activated_oot_plugins) == 1:
|
||||
platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]()
|
||||
logger.info("Platform plugin %s is activated",
|
||||
activated_oot_plugins[0])
|
||||
elif len(activated_builtin_plugins) >= 2:
|
||||
raise RuntimeError(
|
||||
"Only one platform plugin can be activated, but got: "
|
||||
f"{activated_builtin_plugins}")
|
||||
elif len(activated_builtin_plugins) == 1:
|
||||
platform_cls_qualname = builtin_platform_plugins[
|
||||
activated_builtin_plugins[0]]()
|
||||
logger.info("Automatically detected platform %s.",
|
||||
activated_builtin_plugins[0])
|
||||
else:
|
||||
platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform"
|
||||
logger.info(
|
||||
"No platform detected, vLLM is running on UnspecifiedPlatform")
|
||||
return platform_cls_qualname
|
||||
|
||||
|
||||
_current_platform = None
|
||||
_init_trace: str = ''
|
||||
|
||||
if TYPE_CHECKING:
|
||||
current_platform: Platform
|
||||
|
||||
|
||||
def __getattr__(name: str):
|
||||
if name == 'current_platform':
|
||||
# lazy init current_platform.
|
||||
# 1. out-of-tree platform plugins need `from vllm.platforms import
|
||||
# Platform` so that they can inherit `Platform` class. Therefore,
|
||||
# we cannot resolve `current_platform` during the import of
|
||||
# `vllm.platforms`.
|
||||
# 2. when users use out-of-tree platform plugins, they might run
|
||||
# `import vllm`, some vllm internal code might access
|
||||
# `current_platform` during the import, and we need to make sure
|
||||
# `current_platform` is only resolved after the plugins are loaded
|
||||
# (we have tests for this, if any developer violate this, they will
|
||||
# see the test failures).
|
||||
global _current_platform
|
||||
if _current_platform is None:
|
||||
platform_cls_qualname = resolve_current_platform_cls_qualname()
|
||||
_current_platform = resolve_obj_by_qualname(
|
||||
platform_cls_qualname)()
|
||||
global _init_trace
|
||||
_init_trace = "".join(traceback.format_stack())
|
||||
return _current_platform
|
||||
elif name in globals():
|
||||
return globals()[name]
|
||||
else:
|
||||
raise AttributeError(
|
||||
f"No attribute named '{name}' exists in {__name__}.")
|
||||
|
||||
|
||||
__all__ = [
|
||||
'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum',
|
||||
"_init_trace"
|
||||
]
|
||||
266
platforms/cpu.py
Normal file
266
platforms/cpu.py
Normal file
@@ -0,0 +1,266 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import platform
|
||||
import sys
|
||||
from importlib.util import find_spec
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import psutil
|
||||
import torch
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
|
||||
|
||||
from .interface import CpuArchEnum, Platform, PlatformEnum, _Backend
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
else:
|
||||
VllmConfig = None
|
||||
|
||||
|
||||
def get_max_threads(pid=0):
|
||||
if hasattr(os, 'sched_getaffinity'):
|
||||
return len(os.sched_getaffinity(pid))
|
||||
elif platform.system() == 'Darwin':
|
||||
return os.cpu_count()
|
||||
else:
|
||||
raise NotImplementedError("Unsupported OS")
|
||||
|
||||
|
||||
class CpuPlatform(Platform):
|
||||
_enum = PlatformEnum.CPU
|
||||
device_name: str = "cpu"
|
||||
device_type: str = "cpu"
|
||||
dispatch_key: str = "CPU"
|
||||
|
||||
@property
|
||||
def supported_dtypes(self) -> list[torch.dtype]:
|
||||
if self.get_cpu_architecture() == CpuArchEnum.POWERPC:
|
||||
return [torch.bfloat16, torch.float32]
|
||||
elif sys.platform.startswith(
|
||||
"darwin") and self.get_cpu_architecture() == CpuArchEnum.ARM:
|
||||
# TODO: change this condition to check if the platform support bf16
|
||||
# instead of checking the OS. For instance M2 shall supports bf16
|
||||
# already. But we need to modify `cpu_extension.cmake` to activate
|
||||
# the feature in the build.
|
||||
return [torch.float16, torch.float32]
|
||||
# x86/aarch64 CPU has supported both bf16 and fp16 natively.
|
||||
return [torch.bfloat16, torch.float16, torch.float32]
|
||||
|
||||
@classmethod
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
return "cpu"
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
|
||||
dtype: torch.dtype, kv_cache_dtype: Optional[str],
|
||||
block_size: int, use_v1: bool,
|
||||
use_mla: bool) -> str:
|
||||
if selected_backend and selected_backend != _Backend.TORCH_SDPA:
|
||||
logger.info("Cannot use %s backend on CPU.", selected_backend)
|
||||
if use_mla:
|
||||
logger.info("Using CPU MLA backend.")
|
||||
return "vllm.attention.backends.cpu_mla.CPUMLABackend"
|
||||
logger.info("Using Torch SDPA backend.")
|
||||
if use_v1:
|
||||
return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
|
||||
else:
|
||||
return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
|
||||
|
||||
@classmethod
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
return psutil.virtual_memory().total
|
||||
|
||||
@classmethod
|
||||
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def inference_mode(cls):
|
||||
return torch.no_grad()
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
import vllm.envs as envs
|
||||
from vllm.utils import GiB_bytes
|
||||
model_config = vllm_config.model_config
|
||||
# Reminder: Please update docs/features/compatibility_matrix.md
|
||||
# If the feature combo become valid
|
||||
if not model_config.enforce_eager:
|
||||
model_config.enforce_eager = True
|
||||
|
||||
model_config.disable_cascade_attn = True
|
||||
|
||||
cache_config = vllm_config.cache_config
|
||||
|
||||
ipex_available = find_spec("intel_extension_for_pytorch") is not None
|
||||
|
||||
if cache_config and cache_config.block_size is None:
|
||||
cache_config.block_size = 128 if ipex_available else 16
|
||||
|
||||
if not ipex_available and cache_config.block_size != 16:
|
||||
raise RuntimeError(
|
||||
f"--block-size={cache_config.block_size} requires"
|
||||
" intel_extension_for_pytorch")
|
||||
|
||||
scheduler_config = vllm_config.scheduler_config
|
||||
if ((scheduler_config.chunked_prefill_enabled
|
||||
or cache_config.enable_prefix_caching)
|
||||
and cache_config.cache_dtype != "auto"):
|
||||
raise RuntimeError("Chunked-prefill and prefix-cache on the CPU "
|
||||
"backend is not compatible with FP8 KV cache.")
|
||||
|
||||
if cache_config.cache_dtype == "fp8_e4m3":
|
||||
cache_config.cache_dtype = "fp8_e5m2"
|
||||
logger.warning(
|
||||
"CPU backend doesn't support fp8_e4m3 KV cache type, "
|
||||
"cast to fp8_e5m2.")
|
||||
|
||||
if (cache_config.cache_dtype != "auto"
|
||||
and model_config.dtype == torch.half):
|
||||
logger.warning("FP8 KV cache on the CPU backend only does not"
|
||||
" support fp16 for now, cast to bf16.")
|
||||
model_config.dtype = torch.bfloat16
|
||||
|
||||
kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
|
||||
|
||||
if kv_cache_space >= 0:
|
||||
if kv_cache_space == 0:
|
||||
cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore
|
||||
logger.warning(
|
||||
"Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) "
|
||||
"for CPU backend is not set, using 4 by default.")
|
||||
else:
|
||||
cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore # noqa
|
||||
else:
|
||||
raise RuntimeError(
|
||||
"Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
|
||||
f" {kv_cache_space}, expect a positive integer value.")
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
if (parallel_config.world_size > 1
|
||||
and parallel_config.distributed_executor_backend is not None
|
||||
and parallel_config.distributed_executor_backend != "mp"):
|
||||
logger.warning(("%s is not supported on CPU, fallback to mp "
|
||||
"distributed executor backend."),
|
||||
parallel_config.distributed_executor_backend)
|
||||
parallel_config.distributed_executor_backend = "mp"
|
||||
if parallel_config.worker_cls == "auto":
|
||||
if vllm_config.speculative_config:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.spec_decode.spec_decode_worker.create_spec_worker"
|
||||
parallel_config.sd_worker_cls = \
|
||||
"vllm.worker.cpu_worker.CPUWorker"
|
||||
else:
|
||||
if envs.VLLM_USE_V1:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.v1.worker.cpu_worker.CPUWorker"
|
||||
else:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.worker.cpu_worker.CPUWorker"
|
||||
|
||||
# Note: workaround for v1 gpu_model_runner
|
||||
from vllm.config import CompilationLevel
|
||||
vllm_config.compilation_config.cudagraph_capture_sizes = []
|
||||
|
||||
compilation_config = vllm_config.compilation_config
|
||||
if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level
|
||||
== CompilationLevel.PIECEWISE):
|
||||
compilation_config.level = CompilationLevel.DYNAMO_ONCE
|
||||
compilation_config.backend = "eager"
|
||||
compilation_config.custom_ops += ["none"]
|
||||
compilation_config.inductor_compile_config.update({
|
||||
"dce":
|
||||
True,
|
||||
"size_asserts":
|
||||
False,
|
||||
"nan_asserts":
|
||||
False,
|
||||
"memory_planning":
|
||||
True,
|
||||
"epilogue_fusion":
|
||||
True,
|
||||
})
|
||||
|
||||
if vllm_config.lora_config is not None:
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
|
||||
assert vllm_config.device_config.device_type == "cpu"
|
||||
|
||||
#
|
||||
# Environment variables for CPU executor
|
||||
#
|
||||
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
# Note: to avoid the error 'nthreads cannot be larger than environment
|
||||
# variable "NUMEXPR_MAX_THREADS" (64)'.
|
||||
os.environ["NUMEXPR_MAX_THREADS"] = str(get_max_threads())
|
||||
|
||||
# Set default threads num for OpenMP parallel
|
||||
os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
|
||||
|
||||
# Disable torch async compiling which won't work with daemonic processes
|
||||
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
|
||||
|
||||
# Share the cpusets list among ranks by spawning process instead
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
# Intel OpenMP setting
|
||||
ld_prealod_str = os.getenv("LD_PRELOAD", "")
|
||||
if "libiomp5.so" in ld_prealod_str:
|
||||
# The time(milliseconds) that a thread should wait after
|
||||
# completing the execution of a parallel region, before sleeping.
|
||||
os.environ['KMP_BLOCKTIME'] = "1"
|
||||
# Prevents the CPU to run into low performance state
|
||||
os.environ['KMP_TPAUSE'] = "0"
|
||||
# Provides fine granularity parallelism
|
||||
os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
|
||||
os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
|
||||
os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
|
||||
|
||||
# To hint IPEX uses shared memory based AllReduce
|
||||
os.environ["LOCAL_WORLD_SIZE"] = str(
|
||||
vllm_config.parallel_config.tensor_parallel_size)
|
||||
|
||||
if vllm_config.model_config and vllm_config.model_config.use_mla:
|
||||
logger.info(
|
||||
"MLA is enabled on a non-GPU platform; forcing chunked "
|
||||
"prefill and prefix caching to be disabled.")
|
||||
vllm_config.scheduler_config.enable_chunked_prefill = False
|
||||
vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = max(
|
||||
vllm_config.scheduler_config.max_model_len,
|
||||
DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
|
||||
@classmethod
|
||||
def is_pin_memory_available(cls) -> bool:
|
||||
logger.warning("Pin memory is not supported on CPU.")
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_punica_wrapper(cls) -> str:
|
||||
return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"
|
||||
|
||||
@classmethod
|
||||
def get_device_communicator_cls(cls) -> str:
|
||||
"""
|
||||
Get device specific communicator class for distributed communication.
|
||||
"""
|
||||
return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator" # noqa
|
||||
|
||||
@classmethod
|
||||
def supports_structured_output(cls) -> bool:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def supports_v1(cls, model_config) -> bool:
|
||||
"""Returns whether the current platform can support v1 for the supplied
|
||||
model configuration.
|
||||
"""
|
||||
return True
|
||||
526
platforms/cuda.py
Normal file
526
platforms/cuda.py
Normal file
@@ -0,0 +1,526 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Code inside this file can safely assume cuda platform, e.g. importing
|
||||
pynvml. However, it should not initialize cuda context.
|
||||
"""
|
||||
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from functools import wraps
|
||||
from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
|
||||
|
||||
import torch
|
||||
from torch.distributed import PrefixStore, ProcessGroup
|
||||
from torch.distributed.distributed_c10d import is_nccl_available
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
# import custom ops, trigger op registration
|
||||
import vllm._C # noqa
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import import_pynvml
|
||||
|
||||
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
_P = ParamSpec("_P")
|
||||
_R = TypeVar("_R")
|
||||
|
||||
pynvml = import_pynvml()
|
||||
|
||||
# pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models
|
||||
# see https://github.com/huggingface/diffusers/issues/9704 for details
|
||||
torch.backends.cuda.enable_cudnn_sdp(False)
|
||||
|
||||
|
||||
def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
|
||||
|
||||
@wraps(fn)
|
||||
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
|
||||
pynvml.nvmlInit()
|
||||
try:
|
||||
return fn(*args, **kwargs)
|
||||
finally:
|
||||
pynvml.nvmlShutdown()
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
class CudaPlatformBase(Platform):
|
||||
_enum = PlatformEnum.CUDA
|
||||
device_name: str = "cuda"
|
||||
device_type: str = "cuda"
|
||||
dispatch_key: str = "CUDA"
|
||||
ray_device_key: str = "GPU"
|
||||
device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
|
||||
|
||||
@property
|
||||
def supported_dtypes(self) -> list[torch.dtype]:
|
||||
if self.has_device_capability(80):
|
||||
# Ampere and Hopper or later NVIDIA GPUs.
|
||||
return [torch.bfloat16, torch.float16, torch.float32]
|
||||
elif (not self.has_device_capability(80)
|
||||
) and self.has_device_capability(60):
|
||||
# Pascal, Volta and Turing NVIDIA GPUs, BF16 is not supported
|
||||
return [torch.float16, torch.float32]
|
||||
# Kepler and Maxwell NVIDIA GPUs, only FP32 is supported,
|
||||
# though vLLM doesn't support these GPUs.
|
||||
return [torch.float32]
|
||||
|
||||
@classmethod
|
||||
def get_device_capability(cls,
|
||||
device_id: int = 0
|
||||
) -> Optional[DeviceCapability]:
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
|
||||
if enforce_eager:
|
||||
logger.warning(
|
||||
"To see benefits of async output processing, enable CUDA "
|
||||
"graph. Since, enforce-eager is enabled, async output "
|
||||
"processor cannot be used")
|
||||
return False
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def is_fully_connected(cls, device_ids: list[int]) -> bool:
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def log_warnings(cls):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
|
||||
parallel_config = vllm_config.parallel_config
|
||||
scheduler_config = vllm_config.scheduler_config
|
||||
model_config = vllm_config.model_config
|
||||
|
||||
if parallel_config.worker_cls == "auto":
|
||||
if scheduler_config.is_multi_step:
|
||||
if envs.VLLM_USE_V1:
|
||||
raise NotImplementedError(
|
||||
"Multi-step scheduling is not supported (and not "
|
||||
"needed) on vLLM V1. Please launch without "
|
||||
"--num-scheduler-steps.")
|
||||
else:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.worker.multi_step_worker.MultiStepWorker"
|
||||
elif vllm_config.speculative_config:
|
||||
if envs.VLLM_USE_V1:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.v1.worker.gpu_worker.Worker"
|
||||
else:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.spec_decode.spec_decode_worker.create_spec_worker"
|
||||
parallel_config.sd_worker_cls = \
|
||||
"vllm.worker.worker.Worker"
|
||||
else:
|
||||
if envs.VLLM_USE_V1:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.v1.worker.gpu_worker.Worker"
|
||||
else:
|
||||
parallel_config.worker_cls = "vllm.worker.worker.Worker"
|
||||
|
||||
cache_config = vllm_config.cache_config
|
||||
if cache_config and cache_config.block_size is None:
|
||||
cache_config.block_size = 16
|
||||
|
||||
# TODO(lucas): handle this more gracefully
|
||||
# Note: model_config may be None during testing
|
||||
if model_config is not None and model_config.use_mla:
|
||||
# if `VLLM_ATTENTION_BACKEND` is not set and we are using MLA, then
|
||||
# we default to FlashMLA backend, so we need to force the blocksize
|
||||
# here
|
||||
use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \
|
||||
or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
|
||||
from vllm.attention.ops.flashmla import is_flashmla_supported
|
||||
if use_flashmla and is_flashmla_supported()[0] \
|
||||
and cache_config.block_size != 64:
|
||||
cache_config.block_size = 64
|
||||
logger.info(
|
||||
"Forcing kv cache block size to 64 for FlashMLA backend.")
|
||||
|
||||
if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
|
||||
and parallel_config.data_parallel_size > 1
|
||||
and vllm_config.compilation_config.use_cudagraph):
|
||||
logger.info(
|
||||
"Data Parallel: Forcing enforce eager to be True since DP "
|
||||
"with DeepEP high-throughput kernels are not CUDA Graph "
|
||||
"compatible. The DeepEP low-latency kernels are CUDA Graph "
|
||||
"compatible. Set the all_to_all backend to deepep_low_latency "
|
||||
"to use those kernels instead.")
|
||||
vllm_config.compilation_config.use_cudagraph = False
|
||||
vllm_config.model_config.enforce_eager = True
|
||||
# TODO (varun): Turning this ON gives incorrect results for the
|
||||
# Deepseek-V2-lite model.
|
||||
vllm_config.compilation_config.use_inductor = False
|
||||
|
||||
@classmethod
|
||||
def get_current_memory_usage(cls,
|
||||
device: Optional[torch.types.Device] = None
|
||||
) -> float:
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_peak_memory_stats(device)
|
||||
return torch.cuda.max_memory_allocated(device)
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
|
||||
kv_cache_dtype, block_size, use_v1,
|
||||
use_mla) -> str:
|
||||
if use_mla:
|
||||
# TODO(lucas): refactor to be more concise
|
||||
# we should probably consider factoring out V1 here
|
||||
if selected_backend == _Backend.CUTLASS_MLA_VLLM_V1:
|
||||
if use_v1:
|
||||
logger.info_once("Using Cutlass MLA backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends.mla."
|
||||
"cutlass_mla.CutlassMLABackend")
|
||||
else:
|
||||
logger.warning(
|
||||
"Cutlass MLA backend is only supported on V1 engine")
|
||||
if selected_backend == _Backend.TRITON_MLA or block_size != 64:
|
||||
if use_v1:
|
||||
logger.info_once("Using Triton MLA backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends.mla."
|
||||
"triton_mla.TritonMLABackend")
|
||||
else:
|
||||
logger.info("Using Triton MLA backend.")
|
||||
return "vllm.attention.backends.triton_mla.TritonMLABackend"
|
||||
else:
|
||||
from vllm.attention.backends.flashmla import (
|
||||
is_flashmla_supported)
|
||||
if not is_flashmla_supported()[0]:
|
||||
logger.warning(
|
||||
"FlashMLA backend is not supported due to %s",
|
||||
is_flashmla_supported()[1])
|
||||
elif block_size != 64:
|
||||
logger.warning(
|
||||
"FlashMLA backend is not supported for block size %d"
|
||||
" (currently only supports block size 64).",
|
||||
block_size)
|
||||
else:
|
||||
if use_v1:
|
||||
logger.info_once(
|
||||
"Using FlashMLA backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends.mla."
|
||||
"flashmla.FlashMLABackend")
|
||||
else:
|
||||
logger.info("Using FlashMLA backend.")
|
||||
return ("vllm.attention.backends."
|
||||
"flashmla.FlashMLABackend")
|
||||
if use_v1:
|
||||
if selected_backend == _Backend.FLASHINFER:
|
||||
logger.info_once("Using FlashInfer backend on V1 engine.")
|
||||
return "vllm.v1.attention.backends.flashinfer.FlashInferBackend"
|
||||
if selected_backend == _Backend.FLEX_ATTENTION:
|
||||
logger.info("Using FlexAttenion backend on V1 engine.")
|
||||
return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
|
||||
if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
|
||||
logger.info_once("Using Triton backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends."
|
||||
"triton_attn.TritonAttentionBackend")
|
||||
if cls.is_device_capability(100):
|
||||
# Prefer FlashInfer for V1 on Blackwell GPUs if installed
|
||||
try:
|
||||
import flashinfer # noqa: F401
|
||||
logger.info_once(
|
||||
"Using FlashInfer backend on V1 engine by default for "
|
||||
"Blackwell (SM 10.0) GPUs.")
|
||||
return ("vllm.v1.attention.backends."
|
||||
"flashinfer.FlashInferBackend")
|
||||
except ImportError:
|
||||
logger.info_once(
|
||||
"FlashInfer failed to import for V1 engine on "
|
||||
"Blackwell (SM 10.0) GPUs; it is recommended to "
|
||||
"install FlashInfer for better performance.")
|
||||
pass
|
||||
if cls.has_device_capability(80):
|
||||
logger.info_once("Using Flash Attention backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends."
|
||||
"flash_attn.FlashAttentionBackend")
|
||||
if selected_backend == _Backend.FLASHINFER:
|
||||
logger.info("Using FlashInfer backend.")
|
||||
return "vllm.attention.backends.flashinfer.FlashInferBackend"
|
||||
elif selected_backend == _Backend.XFORMERS:
|
||||
logger.info("Using XFormers backend.")
|
||||
return "vllm.attention.backends.xformers.XFormersBackend"
|
||||
elif selected_backend == _Backend.DUAL_CHUNK_FLASH_ATTN:
|
||||
logger.info("Using DualChunkFlashAttention backend.")
|
||||
return ("vllm.attention.backends.dual_chunk_flash_attn."
|
||||
"DualChunkFlashAttentionBackend")
|
||||
elif selected_backend == _Backend.FLASH_ATTN:
|
||||
pass
|
||||
elif selected_backend:
|
||||
raise ValueError(
|
||||
f"Invalid attention backend for {cls.device_name}, "
|
||||
f"with use_v1: {use_v1} use_mla: {use_mla}")
|
||||
|
||||
target_backend = _Backend.FLASH_ATTN
|
||||
if not cls.has_device_capability(80):
|
||||
# Volta and Turing NVIDIA GPUs.
|
||||
logger.info(
|
||||
"Cannot use FlashAttention-2 backend for Volta and Turing "
|
||||
"GPUs.")
|
||||
target_backend = _Backend.XFORMERS
|
||||
elif dtype not in (torch.float16, torch.bfloat16):
|
||||
logger.info(
|
||||
"Cannot use FlashAttention-2 backend for dtype other than "
|
||||
"torch.float16 or torch.bfloat16.")
|
||||
target_backend = _Backend.XFORMERS
|
||||
elif block_size % 16 != 0:
|
||||
logger.info(
|
||||
"Cannot use FlashAttention-2 backend for block size not "
|
||||
"divisible by 16.")
|
||||
target_backend = _Backend.XFORMERS
|
||||
|
||||
# FlashAttn is valid for the model, checking if the package is
|
||||
# installed.
|
||||
if target_backend == _Backend.FLASH_ATTN:
|
||||
try:
|
||||
import flash_attn # noqa: F401
|
||||
from vllm.attention.backends.flash_attn import ( # noqa: F401
|
||||
FlashAttentionBackend, flash_attn_supports_fp8)
|
||||
|
||||
supported_sizes = \
|
||||
FlashAttentionBackend.get_supported_head_sizes()
|
||||
if head_size not in supported_sizes:
|
||||
logger.info(
|
||||
"Cannot use FlashAttention-2 backend for head size %d.",
|
||||
head_size)
|
||||
target_backend = _Backend.XFORMERS
|
||||
fp8_kv_cache = (kv_cache_dtype is not None
|
||||
and kv_cache_dtype.startswith("fp8"))
|
||||
if (fp8_kv_cache and not flash_attn_supports_fp8()):
|
||||
logger.info(
|
||||
"Cannot use FlashAttention backend for FP8 KV cache.")
|
||||
logger.warning(
|
||||
"Please use FlashInfer backend with FP8 KV Cache for "
|
||||
"better performance by setting environment variable "
|
||||
"VLLM_ATTENTION_BACKEND=FLASHINFER")
|
||||
target_backend = _Backend.XFORMERS
|
||||
except ImportError:
|
||||
logger.info(
|
||||
"Cannot use FlashAttention-2 backend because the "
|
||||
"vllm.vllm_flash_attn package is not found. "
|
||||
"Make sure that vllm_flash_attn was built and installed "
|
||||
"(on by default).")
|
||||
target_backend = _Backend.XFORMERS
|
||||
|
||||
if target_backend == _Backend.XFORMERS:
|
||||
logger.info("Using XFormers backend.")
|
||||
return "vllm.attention.backends.xformers.XFormersBackend"
|
||||
|
||||
logger.info("Using Flash Attention backend.")
|
||||
return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
|
||||
|
||||
@classmethod
|
||||
def get_punica_wrapper(cls) -> str:
|
||||
return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
|
||||
|
||||
@classmethod
|
||||
def get_device_communicator_cls(cls) -> str:
|
||||
return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator" # noqa
|
||||
|
||||
@classmethod
|
||||
def supports_fp8(cls) -> bool:
|
||||
return cls.has_device_capability(89)
|
||||
|
||||
@classmethod
|
||||
def supports_v1(cls, model_config: "ModelConfig") -> bool:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def use_custom_allreduce(cls) -> bool:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def get_piecewise_backend_cls(cls) -> str:
|
||||
return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend" # noqa
|
||||
|
||||
@classmethod
|
||||
def stateless_init_device_torch_dist_pg(
|
||||
cls,
|
||||
backend: str,
|
||||
prefix_store: PrefixStore,
|
||||
group_rank: int,
|
||||
group_size: int,
|
||||
timeout: timedelta,
|
||||
) -> ProcessGroup:
|
||||
assert is_nccl_available()
|
||||
pg: ProcessGroup = ProcessGroup(
|
||||
prefix_store,
|
||||
group_rank,
|
||||
group_size,
|
||||
)
|
||||
from torch.distributed.distributed_c10d import ProcessGroupNCCL
|
||||
|
||||
backend_options = ProcessGroupNCCL.Options()
|
||||
backend_options._timeout = timeout
|
||||
|
||||
backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
|
||||
backend_options)
|
||||
backend_type = ProcessGroup.BackendType.NCCL
|
||||
device = torch.device("cuda")
|
||||
pg._set_default_backend(backend_type)
|
||||
backend_class._set_sequence_number_for_group()
|
||||
|
||||
pg._register_backend(device, backend_type, backend_class)
|
||||
return pg
|
||||
|
||||
|
||||
# NVML utils
|
||||
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
|
||||
# all the related functions work on real physical device ids.
|
||||
# the major benefit of using NVML is that it will not initialize CUDA
|
||||
class NvmlCudaPlatform(CudaPlatformBase):
|
||||
|
||||
@classmethod
|
||||
@with_nvml_context
|
||||
def get_device_capability(cls,
|
||||
device_id: int = 0
|
||||
) -> Optional[DeviceCapability]:
|
||||
try:
|
||||
physical_device_id = cls.device_id_to_physical_device_id(device_id)
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
|
||||
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
|
||||
return DeviceCapability(major=major, minor=minor)
|
||||
except RuntimeError:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
@with_nvml_context
|
||||
def has_device_capability(
|
||||
cls,
|
||||
capability: Union[tuple[int, int], int],
|
||||
device_id: int = 0,
|
||||
) -> bool:
|
||||
try:
|
||||
return super().has_device_capability(capability, device_id)
|
||||
except RuntimeError:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
@with_nvml_context
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
physical_device_id = cls.device_id_to_physical_device_id(device_id)
|
||||
return cls._get_physical_device_name(physical_device_id)
|
||||
|
||||
@classmethod
|
||||
@with_nvml_context
|
||||
def get_device_uuid(cls, device_id: int = 0) -> str:
|
||||
physical_device_id = cls.device_id_to_physical_device_id(device_id)
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
|
||||
return pynvml.nvmlDeviceGetUUID(handle)
|
||||
|
||||
@classmethod
|
||||
@with_nvml_context
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
physical_device_id = cls.device_id_to_physical_device_id(device_id)
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
|
||||
return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
|
||||
|
||||
@classmethod
|
||||
@with_nvml_context
|
||||
def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
|
||||
"""
|
||||
query if the set of gpus are fully connected by nvlink (1 hop)
|
||||
"""
|
||||
handles = [
|
||||
pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids
|
||||
]
|
||||
for i, handle in enumerate(handles):
|
||||
for j, peer_handle in enumerate(handles):
|
||||
if i < j:
|
||||
try:
|
||||
p2p_status = pynvml.nvmlDeviceGetP2PStatus(
|
||||
handle,
|
||||
peer_handle,
|
||||
pynvml.NVML_P2P_CAPS_INDEX_NVLINK,
|
||||
)
|
||||
if p2p_status != pynvml.NVML_P2P_STATUS_OK:
|
||||
return False
|
||||
except pynvml.NVMLError:
|
||||
logger.exception(
|
||||
"NVLink detection failed. This is normal if"
|
||||
" your machine has no NVLink equipped.")
|
||||
return False
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def _get_physical_device_name(cls, device_id: int = 0) -> str:
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
|
||||
return pynvml.nvmlDeviceGetName(handle)
|
||||
|
||||
@classmethod
|
||||
@with_nvml_context
|
||||
def log_warnings(cls):
|
||||
device_ids: int = pynvml.nvmlDeviceGetCount()
|
||||
if device_ids > 1:
|
||||
device_names = [
|
||||
cls._get_physical_device_name(i) for i in range(device_ids)
|
||||
]
|
||||
if (len(set(device_names)) > 1
|
||||
and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
|
||||
logger.warning(
|
||||
"Detected different devices in the system: %s. Please"
|
||||
" make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
|
||||
"avoid unexpected behavior.",
|
||||
", ".join(device_names),
|
||||
)
|
||||
|
||||
|
||||
class NonNvmlCudaPlatform(CudaPlatformBase):
|
||||
|
||||
@classmethod
|
||||
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
|
||||
major, minor = torch.cuda.get_device_capability(device_id)
|
||||
return DeviceCapability(major=major, minor=minor)
|
||||
|
||||
@classmethod
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
return torch.cuda.get_device_name(device_id)
|
||||
|
||||
@classmethod
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
device_props = torch.cuda.get_device_properties(device_id)
|
||||
return device_props.total_memory
|
||||
|
||||
@classmethod
|
||||
def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
|
||||
logger.exception(
|
||||
"NVLink detection not possible, as context support was"
|
||||
" not found. Assuming no NVLink available.")
|
||||
return False
|
||||
|
||||
|
||||
# Autodetect either NVML-enabled or non-NVML platform
|
||||
# based on whether NVML is available.
|
||||
nvml_available = False
|
||||
try:
|
||||
try:
|
||||
pynvml.nvmlInit()
|
||||
nvml_available = True
|
||||
except Exception:
|
||||
# On Jetson, NVML is not supported.
|
||||
nvml_available = False
|
||||
finally:
|
||||
if nvml_available:
|
||||
pynvml.nvmlShutdown()
|
||||
|
||||
CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
|
||||
|
||||
CudaPlatform.log_warnings()
|
||||
106
platforms/hpu.py
Normal file
106
platforms/hpu.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
|
||||
|
||||
from .interface import Platform, PlatformEnum, _Backend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
else:
|
||||
VllmConfig = None
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class HpuPlatform(Platform):
|
||||
_enum = PlatformEnum.HPU
|
||||
device_name: str = "hpu"
|
||||
device_type: str = "hpu"
|
||||
dispatch_key: str = "HPU"
|
||||
ray_device_key: str = "HPU"
|
||||
device_control_env_var: str = "HABANA_VISIBLE_MODULES"
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
|
||||
dtype: torch.dtype, kv_cache_dtype: Optional[str],
|
||||
block_size: int, use_v1: bool,
|
||||
use_mla: bool) -> str:
|
||||
logger.info("Using HPUAttention backend.")
|
||||
return "vllm.attention.backends.hpu_attn.HPUAttentionBackend"
|
||||
|
||||
@classmethod
|
||||
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def inference_mode(cls):
|
||||
return torch.no_grad()
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
|
||||
scheduler_config = vllm_config.scheduler_config
|
||||
parallel_config = vllm_config.parallel_config
|
||||
if scheduler_config.is_multi_step:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.worker.multi_step_hpu_worker.MultiStepHPUWorker"
|
||||
|
||||
if vllm_config.speculative_config is not None:
|
||||
raise NotImplementedError(
|
||||
"Speculative decoding is not implemented for HPU")
|
||||
|
||||
if parallel_config.worker_cls == "auto":
|
||||
parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
|
||||
|
||||
# NOTE(kzawora): default block size for Gaudi should be 128
|
||||
# smaller sizes still work, but very inefficiently
|
||||
cache_config = vllm_config.cache_config
|
||||
if cache_config and cache_config.block_size is None:
|
||||
cache_config.block_size = 128
|
||||
if (parallel_config.distributed_executor_backend == 'mp'
|
||||
and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
|
||||
if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
|
||||
None) is not None:
|
||||
logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
|
||||
"might cause application hangs on exit. Using "
|
||||
"VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
|
||||
"as it was explicitly requested.")
|
||||
else:
|
||||
logger.warning(
|
||||
"On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
|
||||
"might cause application hangs on exit. Setting "
|
||||
"VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
|
||||
"To override that behavior, please set "
|
||||
"VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
if vllm_config.model_config and vllm_config.model_config.use_mla:
|
||||
logger.info(
|
||||
"MLA is enabled on a non-GPU platform; forcing chunked "
|
||||
"prefill and prefix caching to be disabled.")
|
||||
vllm_config.scheduler_config.enable_chunked_prefill = False
|
||||
vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = max(
|
||||
vllm_config.scheduler_config.max_model_len,
|
||||
DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
|
||||
@classmethod
|
||||
def is_pin_memory_available(cls):
|
||||
logger.warning("Pin memory is not supported on HPU.")
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_punica_wrapper(cls) -> str:
|
||||
return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"
|
||||
|
||||
@classmethod
|
||||
def get_device_communicator_cls(cls) -> str:
|
||||
return "vllm.distributed.device_communicators.hpu_communicator.HpuCommunicator" # noqa
|
||||
538
platforms/interface.py
Normal file
538
platforms/interface.py
Normal file
@@ -0,0 +1,538 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import enum
|
||||
import os
|
||||
import platform
|
||||
import random
|
||||
from datetime import timedelta
|
||||
from platform import uname
|
||||
from typing import TYPE_CHECKING, NamedTuple, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.distributed import PrefixStore, ProcessGroup
|
||||
|
||||
from vllm.inputs import ProcessorInputs, PromptType
|
||||
from vllm.logger import init_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
else:
|
||||
ModelConfig = None
|
||||
VllmConfig = None
|
||||
LoRARequest = None
|
||||
PoolingParams = None
|
||||
SamplingParams = None
|
||||
FlexibleArgumentParser = None
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def in_wsl() -> bool:
|
||||
# Reference: https://github.com/microsoft/WSL/issues/4071
|
||||
return "microsoft" in " ".join(uname()).lower()
|
||||
|
||||
|
||||
class _Backend(enum.Enum):
|
||||
FLASH_ATTN = enum.auto()
|
||||
FLASH_ATTN_VLLM_V1 = enum.auto()
|
||||
TRITON_ATTN_VLLM_V1 = enum.auto()
|
||||
XFORMERS = enum.auto()
|
||||
ROCM_FLASH = enum.auto()
|
||||
ROCM_AITER_MLA = enum.auto() # Supported by V1
|
||||
ROCM_AITER_MLA_VLLM_V1 = enum.auto()
|
||||
TORCH_SDPA = enum.auto()
|
||||
FLASHINFER = enum.auto()
|
||||
FLASHINFER_VLLM_V1 = enum.auto()
|
||||
TRITON_MLA = enum.auto() # Supported by V1
|
||||
TRITON_MLA_VLLM_V1 = enum.auto()
|
||||
FLASHMLA_VLLM_V1 = enum.auto()
|
||||
FLASHMLA = enum.auto() # Supported by V1
|
||||
CUTLASS_MLA_VLLM_V1 = enum.auto()
|
||||
HPU_ATTN = enum.auto()
|
||||
PALLAS = enum.auto()
|
||||
PALLAS_VLLM_V1 = enum.auto()
|
||||
IPEX = enum.auto()
|
||||
BLOCK_SPARSE_FLASH_ATTN = enum.auto()
|
||||
DUAL_CHUNK_FLASH_ATTN = enum.auto()
|
||||
NO_ATTENTION = enum.auto()
|
||||
FLEX_ATTENTION = enum.auto()
|
||||
|
||||
|
||||
class PlatformEnum(enum.Enum):
|
||||
CUDA = enum.auto()
|
||||
ROCM = enum.auto()
|
||||
TPU = enum.auto()
|
||||
HPU = enum.auto()
|
||||
XPU = enum.auto()
|
||||
CPU = enum.auto()
|
||||
NEURON = enum.auto()
|
||||
OOT = enum.auto()
|
||||
UNSPECIFIED = enum.auto()
|
||||
|
||||
|
||||
class CpuArchEnum(enum.Enum):
|
||||
X86 = enum.auto()
|
||||
ARM = enum.auto()
|
||||
POWERPC = enum.auto()
|
||||
OTHER = enum.auto()
|
||||
UNKNOWN = enum.auto()
|
||||
|
||||
|
||||
class DeviceCapability(NamedTuple):
|
||||
major: int
|
||||
minor: int
|
||||
|
||||
def as_version_str(self) -> str:
|
||||
return f"{self.major}.{self.minor}"
|
||||
|
||||
def to_int(self) -> int:
|
||||
"""
|
||||
Express device capability as an integer `<major><minor>`.
|
||||
|
||||
It is assumed that the minor version is always a single digit.
|
||||
"""
|
||||
assert 0 <= self.minor < 10
|
||||
return self.major * 10 + self.minor
|
||||
|
||||
|
||||
class Platform:
|
||||
_enum: PlatformEnum
|
||||
device_name: str
|
||||
device_type: str
|
||||
|
||||
# available dispatch keys:
|
||||
# check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa
|
||||
# use "CPU" as a fallback for platforms not registered in PyTorch
|
||||
dispatch_key: str = "CPU"
|
||||
|
||||
# available ray device keys:
|
||||
# https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
|
||||
# empty string means the device does not support ray
|
||||
ray_device_key: str = ""
|
||||
|
||||
# platform-agnostic way to specify the device control environment variable,
|
||||
# .e.g. CUDA_VISIBLE_DEVICES for CUDA.
|
||||
# hint: search for "get_visible_accelerator_ids_env_var" in
|
||||
# https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa
|
||||
device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"
|
||||
|
||||
# The torch.compile backend for compiling simple and
|
||||
# standalone functions. The default value is "inductor" to keep
|
||||
# the same behavior as PyTorch.
|
||||
# NOTE: for the forward part of the model, vLLM has another separate
|
||||
# compilation strategy.
|
||||
simple_compile_backend: str = "inductor"
|
||||
|
||||
supported_quantization: list[str] = []
|
||||
|
||||
additional_env_vars: list[str] = []
|
||||
|
||||
@property
|
||||
def supported_dtypes(self) -> list[torch.dtype]:
|
||||
"""Returns the supported dtypes for the current platform."""
|
||||
# Be careful with the order of the dtypes. The first dtype will
|
||||
# be used as the default dtype fallback for the current platform,
|
||||
# when encountering unsupported dtypes in "auto" dtype.
|
||||
return [torch.bfloat16, torch.float16, torch.float32]
|
||||
|
||||
def is_cuda(self) -> bool:
|
||||
return self._enum == PlatformEnum.CUDA
|
||||
|
||||
def is_rocm(self) -> bool:
|
||||
return self._enum == PlatformEnum.ROCM
|
||||
|
||||
def is_tpu(self) -> bool:
|
||||
return self._enum == PlatformEnum.TPU
|
||||
|
||||
def is_hpu(self) -> bool:
|
||||
return self._enum == PlatformEnum.HPU
|
||||
|
||||
def is_xpu(self) -> bool:
|
||||
return self._enum == PlatformEnum.XPU
|
||||
|
||||
def is_cpu(self) -> bool:
|
||||
return self._enum == PlatformEnum.CPU
|
||||
|
||||
def is_neuron(self) -> bool:
|
||||
return self._enum == PlatformEnum.NEURON
|
||||
|
||||
def is_out_of_tree(self) -> bool:
|
||||
return self._enum == PlatformEnum.OOT
|
||||
|
||||
def is_cuda_alike(self) -> bool:
|
||||
"""Stateless version of [torch.cuda.is_available][]."""
|
||||
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
|
||||
|
||||
def is_sleep_mode_available(self) -> bool:
|
||||
return self._enum == PlatformEnum.CUDA
|
||||
|
||||
@classmethod
|
||||
def device_id_to_physical_device_id(cls, device_id: int):
|
||||
if cls.device_control_env_var in os.environ:
|
||||
device_ids = os.environ[cls.device_control_env_var].split(",")
|
||||
if device_ids == [""]:
|
||||
msg = (f"{cls.device_control_env_var} is set to empty string, "
|
||||
"which means current platform support is disabled. If "
|
||||
"you are using ray, please unset the environment "
|
||||
f"variable `{cls.device_control_env_var}` inside the "
|
||||
"worker/actor. Check "
|
||||
"https://github.com/vllm-project/vllm/issues/8402 for "
|
||||
"more information.")
|
||||
raise RuntimeError(msg)
|
||||
physical_device_id = device_ids[device_id]
|
||||
return int(physical_device_id)
|
||||
else:
|
||||
return device_id
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
|
||||
dtype: torch.dtype, kv_cache_dtype: Optional[str],
|
||||
block_size: int, use_v1: bool,
|
||||
use_mla: bool) -> str:
|
||||
"""Get the attention backend class of a device."""
|
||||
return ""
|
||||
|
||||
@classmethod
|
||||
def get_device_capability(
|
||||
cls,
|
||||
device_id: int = 0,
|
||||
) -> Optional[DeviceCapability]:
|
||||
"""Stateless version of [torch.cuda.get_device_capability][]."""
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def has_device_capability(
|
||||
cls,
|
||||
capability: Union[tuple[int, int], int],
|
||||
device_id: int = 0,
|
||||
) -> bool:
|
||||
"""
|
||||
Test whether this platform is compatible with a device capability.
|
||||
|
||||
The `capability` argument can either be:
|
||||
|
||||
- A tuple `(major, minor)`.
|
||||
- An integer `<major><minor>`. (See
|
||||
[`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
|
||||
"""
|
||||
current_capability = cls.get_device_capability(device_id=device_id)
|
||||
if current_capability is None:
|
||||
return False
|
||||
|
||||
if isinstance(capability, tuple):
|
||||
return current_capability >= capability
|
||||
|
||||
return current_capability.to_int() >= capability
|
||||
|
||||
@classmethod
|
||||
def is_device_capability(
|
||||
cls,
|
||||
capability: Union[tuple[int, int], int],
|
||||
device_id: int = 0,
|
||||
) -> bool:
|
||||
"""
|
||||
Test whether this platform has exactly the specified device capability.
|
||||
|
||||
The `capability` argument can either be:
|
||||
|
||||
- A tuple `(major, minor)`.
|
||||
- An integer `<major><minor>`. (See
|
||||
[`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
|
||||
"""
|
||||
current_capability = cls.get_device_capability(device_id=device_id)
|
||||
if current_capability is None:
|
||||
return False
|
||||
|
||||
if isinstance(capability, tuple):
|
||||
return current_capability == capability
|
||||
|
||||
return current_capability.to_int() == capability
|
||||
|
||||
@classmethod
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
"""Get the name of a device."""
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def get_device_uuid(cls, device_id: int = 0) -> str:
|
||||
"""Get the uuid of a device, e.g. the PCI bus ID."""
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
"""Get the total memory of a device in bytes."""
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
|
||||
"""
|
||||
Check if the current platform supports async output.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def inference_mode(cls):
|
||||
"""A device-specific wrapper of `torch.inference_mode`.
|
||||
|
||||
This wrapper is recommended because some hardware backends such as TPU
|
||||
do not support `torch.inference_mode`. In such a case, they will fall
|
||||
back to `torch.no_grad` by overriding this method.
|
||||
"""
|
||||
return torch.inference_mode(mode=True)
|
||||
|
||||
@classmethod
|
||||
def seed_everything(cls, seed: Optional[int] = None) -> None:
|
||||
"""
|
||||
Set the seed of each random module.
|
||||
`torch.manual_seed` will set seed on all devices.
|
||||
|
||||
Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
|
||||
"""
|
||||
if seed is not None:
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
|
||||
@classmethod
|
||||
def pre_register_and_update(cls,
|
||||
parser: Optional[FlexibleArgumentParser] = None
|
||||
) -> None:
|
||||
"""
|
||||
Do some pre-registration or update action for the current platform.
|
||||
|
||||
This function is called before global VllmConfig is initialized or cli
|
||||
arguments are parsed. It's used for out-of-tree platforms to register or
|
||||
update the configuration.
|
||||
|
||||
For example, the out-of-tree quantization config can be imported and
|
||||
registered here dynamically.
|
||||
"""
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
"""
|
||||
Check and update the configuration for the current platform.
|
||||
|
||||
It can raise an exception if the configuration is not compatible with
|
||||
the current platform, or it can update the configuration to make it
|
||||
compatible with the current platform.
|
||||
|
||||
The config is passed by reference, so it can be modified in place.
|
||||
"""
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def verify_model_arch(cls, model_arch: str) -> None:
|
||||
"""
|
||||
Verify whether the current platform supports the specified model
|
||||
architecture.
|
||||
|
||||
- This will raise an Error or Warning based on the model support on
|
||||
the current platform.
|
||||
- By default all models are considered supported.
|
||||
"""
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def verify_quantization(cls, quant: str) -> None:
|
||||
"""
|
||||
Verify whether the quantization is supported by the current platform.
|
||||
"""
|
||||
if cls.supported_quantization and \
|
||||
quant not in cls.supported_quantization:
|
||||
raise ValueError(
|
||||
f"{quant} quantization is currently not supported in "
|
||||
f"{cls.device_name}.")
|
||||
|
||||
@classmethod
|
||||
def get_cpu_architecture(cls) -> CpuArchEnum:
|
||||
"""
|
||||
Determine the CPU architecture of the current system.
|
||||
Returns CpuArchEnum indicating the architecture type.
|
||||
"""
|
||||
machine = platform.machine().lower()
|
||||
|
||||
if machine in ("x86_64", "amd64", "i386", "i686"):
|
||||
return CpuArchEnum.X86
|
||||
elif machine.startswith("arm") or machine.startswith("aarch"):
|
||||
return CpuArchEnum.ARM
|
||||
elif machine.startswith("ppc"):
|
||||
return CpuArchEnum.POWERPC
|
||||
|
||||
return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN
|
||||
|
||||
@classmethod
|
||||
def is_pin_memory_available(cls) -> bool:
|
||||
"""Checks whether pin memory is available on the current platform."""
|
||||
if in_wsl():
|
||||
# Pinning memory in WSL is not supported.
|
||||
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
|
||||
logger.warning("Using 'pin_memory=False' as WSL is detected. "
|
||||
"This may slow down the performance.")
|
||||
return False
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def get_current_memory_usage(cls,
|
||||
device: Optional[torch.types.Device] = None
|
||||
) -> float:
|
||||
"""
|
||||
Return the memory usage in bytes.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def get_punica_wrapper(cls) -> str:
|
||||
"""
|
||||
Return the punica wrapper for current platform.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def get_infinity_values(cls, dtype: torch.dtype) -> tuple[float, float]:
|
||||
"""
|
||||
Return the platform specific values for (-inf, inf)
|
||||
"""
|
||||
return float("-inf"), float("inf")
|
||||
|
||||
@classmethod
|
||||
def can_update_inplace(cls) -> bool:
|
||||
"""
|
||||
Checks if the platform allows inplace memory updates
|
||||
"""
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def get_lora_vocab_padding_size(cls) -> int:
|
||||
"""
|
||||
Returns how much padding the LoRA logits need for kernels
|
||||
"""
|
||||
return 256
|
||||
|
||||
@classmethod
|
||||
def get_device_communicator_cls(cls) -> str:
|
||||
"""
|
||||
Get device specific communicator class for distributed communication.
|
||||
"""
|
||||
return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase" # noqa
|
||||
|
||||
@classmethod
|
||||
def supports_mx(cls) -> bool:
|
||||
"""
|
||||
Returns whether the current platform supports MX types.
|
||||
"""
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def supports_fp8(cls) -> bool:
|
||||
"""
|
||||
Returns whether the current platform supports FP8 types.
|
||||
"""
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def is_fp8_fnuz(cls) -> bool:
|
||||
"""
|
||||
Returns whether the preferred FP8 type is FNUZ on the current platform.
|
||||
|
||||
There are two representations of FP8, OCP FP8 and FNUZ FP8.
|
||||
The OCP specification can be found at https://tinyurl.com/b7jvwpft.
|
||||
The FNUZ specification can be found at https://tinyurl.com/5n6hwwu5.
|
||||
|
||||
AMD's MI300 and MI325 have native hardware support for FNUZ. All other
|
||||
hardware has converged on the OCP FP8 standard.
|
||||
"""
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def fp8_dtype(cls) -> torch.dtype:
|
||||
"""
|
||||
Returns the preferred FP8 type on the current platform.
|
||||
|
||||
See the documentation for is_fp8_fnuz for details.
|
||||
"""
|
||||
return torch.float8_e4m3fn
|
||||
|
||||
@classmethod
|
||||
def use_all_gather(cls) -> bool:
|
||||
"""
|
||||
Whether to use allgather in LogitsProcessor to gather the logits.
|
||||
"""
|
||||
import vllm.envs as envs
|
||||
from vllm.config import get_current_vllm_config
|
||||
|
||||
parallel_config = get_current_vllm_config().parallel_config
|
||||
return (envs.VLLM_USE_V1
|
||||
or parallel_config.distributed_executor_backend
|
||||
== "external_launcher")
|
||||
|
||||
@classmethod
|
||||
def supports_v1(cls, model_config: ModelConfig) -> bool:
|
||||
"""Returns whether the current platform can support v1 for the supplied
|
||||
model configuration.
|
||||
"""
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def use_custom_allreduce(cls) -> bool:
|
||||
"""
|
||||
Returns if custom allreduce is supported on the current platform
|
||||
"""
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def validate_request(
|
||||
cls,
|
||||
prompt: PromptType,
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
processed_inputs: ProcessorInputs,
|
||||
) -> None:
|
||||
"""Raises if this request is unsupported on this platform"""
|
||||
|
||||
def __getattr__(self, key: str):
|
||||
device = getattr(torch, self.device_type, None)
|
||||
if device is not None and hasattr(device, key):
|
||||
return getattr(device, key)
|
||||
else:
|
||||
logger.warning("Current platform %s does not have '%s'" \
|
||||
" attribute.", self.device_type, key)
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def get_cu_count(cls, device_id: int = 0) -> int:
|
||||
"""
|
||||
Returns the total number of compute units (CU) on single GPU.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def get_piecewise_backend_cls(cls) -> str:
|
||||
"""
|
||||
Get piecewise backend class for piecewise graph.
|
||||
"""
|
||||
return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend" # noqa
|
||||
|
||||
@classmethod
|
||||
def stateless_init_device_torch_dist_pg(
|
||||
cls,
|
||||
backend: str,
|
||||
prefix_store: PrefixStore,
|
||||
group_rank: int,
|
||||
group_size: int,
|
||||
timeout: timedelta,
|
||||
) -> ProcessGroup:
|
||||
"""
|
||||
Init platform-specific torch distributed process group.
|
||||
"""
|
||||
raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
|
||||
|
||||
|
||||
class UnspecifiedPlatform(Platform):
|
||||
_enum = PlatformEnum.UNSPECIFIED
|
||||
device_type = ""
|
||||
150
platforms/neuron.py
Normal file
150
platforms/neuron.py
Normal file
@@ -0,0 +1,150 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import enum
|
||||
import os
|
||||
from functools import lru_cache
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from vllm import envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
|
||||
|
||||
from .interface import Platform, PlatformEnum
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
else:
|
||||
VllmConfig = None
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class NeuronFramework(enum.Enum):
|
||||
TRANSFORMERS_NEURONX = "transformers-neuronx"
|
||||
NEURONX_DISTRIBUTED_INFERENCE = "neuronx-distributed-inference"
|
||||
|
||||
|
||||
class NeuronPlatform(Platform):
|
||||
_enum = PlatformEnum.NEURON
|
||||
device_name: str = "neuron"
|
||||
device_type: str = "neuron"
|
||||
ray_device_key: str = "neuron_cores"
|
||||
supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"]
|
||||
device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"
|
||||
|
||||
@classmethod
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
return "neuron"
|
||||
|
||||
@classmethod
|
||||
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
parallel_config = vllm_config.parallel_config
|
||||
if parallel_config.worker_cls == "auto":
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.worker.neuron_worker.NeuronWorker"
|
||||
|
||||
if parallel_config.world_size > 1:
|
||||
parallel_config.distributed_executor_backend = "uni"
|
||||
|
||||
if vllm_config.cache_config and vllm_config.model_config:
|
||||
# neuron needs block_size = max_model_len
|
||||
vllm_config.cache_config.block_size = \
|
||||
vllm_config.model_config.max_model_len # type: ignore
|
||||
|
||||
if vllm_config.model_config and vllm_config.model_config.use_mla:
|
||||
logger.info(
|
||||
"MLA is enabled on a non-GPU platform; forcing chunked "
|
||||
"prefill and prefix caching to be disabled.")
|
||||
vllm_config.scheduler_config.enable_chunked_prefill = False
|
||||
vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = max(
|
||||
vllm_config.scheduler_config.max_model_len,
|
||||
DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
|
||||
@classmethod
|
||||
def is_pin_memory_available(cls) -> bool:
|
||||
logger.warning("Pin memory is not supported on Neuron.")
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_device_communicator_cls(cls) -> str:
|
||||
if envs.VLLM_USE_V1:
|
||||
return "vllm.distributed.device_communicators.neuron_communicator.NeuronCommunicator" # noqa
|
||||
else:
|
||||
return Platform.get_device_communicator_cls()
|
||||
|
||||
@classmethod
|
||||
def use_all_gather(cls) -> bool:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
@lru_cache
|
||||
def is_neuronx_distributed_inference(cls) -> bool:
|
||||
try:
|
||||
import neuronx_distributed_inference
|
||||
except ImportError:
|
||||
neuronx_distributed_inference = None
|
||||
return neuronx_distributed_inference is not None
|
||||
|
||||
@classmethod
|
||||
@lru_cache
|
||||
def is_transformers_neuronx(cls) -> bool:
|
||||
try:
|
||||
import transformers_neuronx
|
||||
except ImportError:
|
||||
transformers_neuronx = None
|
||||
return transformers_neuronx is not None
|
||||
|
||||
def get_neuron_framework_to_use(self):
|
||||
"""Return the specified framework if corresponding installations are
|
||||
available.
|
||||
|
||||
If no framework is specified, use neuronx-distributed-inference by
|
||||
default.
|
||||
If that's unavailable, check and switch to transformers-neuronx.
|
||||
"""
|
||||
if not self.is_neuron():
|
||||
raise AssertionError(
|
||||
f"Neuron Framework unavailable for platform: {self}")
|
||||
|
||||
tnx_installed = self.is_transformers_neuronx()
|
||||
nxd_installed = self.is_neuronx_distributed_inference()
|
||||
|
||||
specified_framework = os.environ.get("VLLM_NEURON_FRAMEWORK")
|
||||
tnx_framework = NeuronFramework.TRANSFORMERS_NEURONX.value
|
||||
nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE.value
|
||||
if specified_framework == tnx_framework and tnx_installed:
|
||||
return self.TRANSFORMERS_NEURONX
|
||||
|
||||
if ((specified_framework == nxd_framework and nxd_installed)
|
||||
or (specified_framework is None and nxd_installed)):
|
||||
return NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE
|
||||
|
||||
if specified_framework is None and tnx_installed:
|
||||
return NeuronFramework.TRANSFORMERS_NEURONX
|
||||
|
||||
return None
|
||||
|
||||
def use_neuronx_distributed(self):
|
||||
"""
|
||||
Return True if the framework determined in get_neuron_framework_to_use()
|
||||
is NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE, False otherwise. This
|
||||
is used to select the Neuron model framework and framework-specific
|
||||
configuration to apply during model compilation.
|
||||
"""
|
||||
nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE
|
||||
return self.get_neuron_framework_to_use() == nxd_framework
|
||||
|
||||
def use_transformers_neuronx(self):
|
||||
"""
|
||||
Return True if the framework determined in get_neuron_framework_to_use()
|
||||
is NeuronFramework.TRANSFORMERS_NEURONX, False otherwise. This is used
|
||||
to select the Neuron model framework and framework-specific
|
||||
configuration to apply during model compilation.
|
||||
"""
|
||||
return self.get_neuron_framework_to_use(
|
||||
) == NeuronFramework.TRANSFORMERS_NEURONX
|
||||
435
platforms/rocm.py
Normal file
435
platforms/rocm.py
Normal file
@@ -0,0 +1,435 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from functools import cache, lru_cache, wraps
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import torch
|
||||
from torch.distributed import PrefixStore, ProcessGroup
|
||||
from torch.distributed.distributed_c10d import is_nccl_available
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
try:
|
||||
from amdsmi import (AmdSmiException, amdsmi_get_gpu_asic_info,
|
||||
amdsmi_get_processor_handles, amdsmi_init,
|
||||
amdsmi_shut_down, amdsmi_topo_get_link_type)
|
||||
except ImportError as e:
|
||||
logger.warning("Failed to import from amdsmi with %r", e)
|
||||
|
||||
try:
|
||||
import vllm._C # noqa: F401
|
||||
except ImportError as e:
|
||||
logger.warning("Failed to import from vllm._C with %r", e)
|
||||
|
||||
# import custom ops, trigger op registration
|
||||
try:
|
||||
import vllm._rocm_C # noqa: F401
|
||||
except ImportError as e:
|
||||
logger.warning("Failed to import from vllm._rocm_C with %r", e)
|
||||
|
||||
# Models not supported by ROCm.
|
||||
_ROCM_UNSUPPORTED_MODELS: list[str] = []
|
||||
|
||||
# Models partially supported by ROCm.
|
||||
# Architecture -> Reason.
|
||||
_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
|
||||
"Triton flash attention. For half-precision SWA support, "
|
||||
"please use CK flash attention by setting "
|
||||
"`VLLM_USE_TRITON_FLASH_ATTN=0`")
|
||||
_ROCM_PARTIALLY_SUPPORTED_MODELS: dict[str, str] = {
|
||||
"Qwen2ForCausalLM":
|
||||
_ROCM_SWA_REASON,
|
||||
"MistralForCausalLM":
|
||||
_ROCM_SWA_REASON,
|
||||
"MixtralForCausalLM":
|
||||
_ROCM_SWA_REASON,
|
||||
"PaliGemmaForConditionalGeneration":
|
||||
("ROCm flash attention does not yet "
|
||||
"fully support 32-bit precision on PaliGemma"),
|
||||
"Phi3VForCausalLM":
|
||||
("ROCm Triton flash attention may run into compilation errors due to "
|
||||
"excessive use of shared memory. If this happens, disable Triton FA "
|
||||
"by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
|
||||
}
|
||||
_ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
|
||||
"0x74a0": "AMD_Instinct_MI300A",
|
||||
"0x74a1": "AMD_Instinct_MI300X",
|
||||
"0x74b5": "AMD_Instinct_MI300X", # MI300X VF
|
||||
"0x74a5": "AMD_Instinct_MI325X",
|
||||
"0x74b9": "AMD_Instinct_MI325X", # MI325X VF
|
||||
"0x74a9": "AMD_Instinct_MI300X_HF",
|
||||
"0x74bd": "AMD_Instinct_MI300X_HF",
|
||||
}
|
||||
|
||||
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES``
|
||||
if "HIP_VISIBLE_DEVICES" in os.environ:
|
||||
val = os.environ["HIP_VISIBLE_DEVICES"]
|
||||
if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
|
||||
assert val == cuda_val
|
||||
else:
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = val
|
||||
|
||||
# AMDSMI utils
|
||||
# Note that NVML is not affected by `{CUDA/HIP}_VISIBLE_DEVICES`,
|
||||
# all the related functions work on real physical device ids.
|
||||
# the major benefit of using AMDSMI is that it will not initialize CUDA
|
||||
|
||||
|
||||
def with_amdsmi_context(fn):
|
||||
|
||||
@wraps(fn)
|
||||
def wrapper(*args, **kwargs):
|
||||
amdsmi_init()
|
||||
try:
|
||||
return fn(*args, **kwargs)
|
||||
finally:
|
||||
amdsmi_shut_down()
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
@cache
|
||||
def on_gfx1x() -> bool:
|
||||
GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
|
||||
return any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"])
|
||||
|
||||
|
||||
@cache
|
||||
def on_mi3xx() -> bool:
|
||||
GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
|
||||
return any(arch in GPU_ARCH for arch in ["gfx942", "gfx950"])
|
||||
|
||||
|
||||
@cache
|
||||
def on_gfx9() -> bool:
|
||||
GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
|
||||
return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
|
||||
|
||||
|
||||
@cache
|
||||
def use_rocm_custom_paged_attention(
|
||||
qtype: torch.dtype,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
gqa_ratio: int,
|
||||
max_seq_len: int,
|
||||
sliding_window: int,
|
||||
kv_cache_dtype: str,
|
||||
alibi_slopes: Optional[torch.Tensor] = None) -> bool:
|
||||
|
||||
GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
|
||||
ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
|
||||
ON_GFX11_GFX12 = any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"])
|
||||
|
||||
# custom paged attn always supported on V0. On V1, requires sliding window
|
||||
# disabled due to observed numerical discrepancy.
|
||||
if ON_GFX9:
|
||||
return ((not envs.VLLM_USE_V1 or sliding_window == 0
|
||||
or sliding_window == (-1, -1))
|
||||
and (qtype == torch.half or qtype == torch.bfloat16)
|
||||
and (head_size == 64 or head_size == 128)
|
||||
and (block_size == 16 or block_size == 32)
|
||||
and (gqa_ratio >= 1 and gqa_ratio <= 16)
|
||||
and max_seq_len <= 32768 and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
|
||||
and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
|
||||
and envs.VLLM_ROCM_USE_AITER))
|
||||
|
||||
else:
|
||||
return (ON_GFX11_GFX12 and (not envs.VLLM_USE_V1 or sliding_window == 0
|
||||
or sliding_window == (-1, -1))
|
||||
and (qtype == torch.half or qtype == torch.bfloat16)
|
||||
and head_size == 128 and block_size == 16
|
||||
and (gqa_ratio >= 3 and gqa_ratio <= 16)
|
||||
and max_seq_len <= 32768 and alibi_slopes is None
|
||||
and kv_cache_dtype == "auto"
|
||||
and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
|
||||
|
||||
|
||||
class RocmPlatform(Platform):
|
||||
_enum = PlatformEnum.ROCM
|
||||
device_name: str = "rocm"
|
||||
device_type: str = "cuda"
|
||||
dispatch_key: str = "CUDA"
|
||||
ray_device_key: str = "GPU"
|
||||
# rocm shares the same device control env var as CUDA
|
||||
device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
|
||||
|
||||
supported_quantization: list[str] = [
|
||||
"awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
|
||||
"quark", "ptpc_fp8"
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
|
||||
kv_cache_dtype, block_size, use_v1,
|
||||
use_mla) -> str:
|
||||
if use_mla:
|
||||
from vllm.attention.backends.rocm_aiter_mla import (
|
||||
is_aiter_mla_enabled)
|
||||
|
||||
if selected_backend is None:
|
||||
selected_backend = (_Backend.ROCM_AITER_MLA if
|
||||
is_aiter_mla_enabled() or block_size == 1
|
||||
else _Backend.TRITON_MLA)
|
||||
|
||||
if selected_backend == _Backend.TRITON_MLA:
|
||||
if block_size != 1:
|
||||
logger.info("Using Triton MLA backend.")
|
||||
return "vllm.attention.backends.triton_mla.TritonMLABackend" # noqa: E501
|
||||
else:
|
||||
raise ValueError(
|
||||
f" The selected backend, {selected_backend.name},"
|
||||
f"does not support block size {block_size}.")
|
||||
elif selected_backend == _Backend.ROCM_AITER_MLA \
|
||||
or selected_backend == _Backend.ROCM_AITER_MLA_VLLM_V1:
|
||||
if block_size == 1:
|
||||
if use_v1:
|
||||
logger.info("Using AITER MLA backend on V1 engine.")
|
||||
return "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" # noqa: E501
|
||||
else:
|
||||
logger.info("Using AITER MLA backend")
|
||||
return "vllm.attention.backends.rocm_aiter_mla.AiterMLABackend" # noqa: E501
|
||||
else:
|
||||
raise ValueError(
|
||||
f" The selected backend, {selected_backend.name},"
|
||||
f"does not support block size {block_size}."
|
||||
"(currently only supports block size 1)")
|
||||
else:
|
||||
raise ValueError(
|
||||
f" The selected backend, {selected_backend.name},"
|
||||
f"is not MLA type while requested for MLA backend.")
|
||||
|
||||
if selected_backend is None or selected_backend == _Backend.FLASH_ATTN:
|
||||
selected_backend = _Backend.ROCM_FLASH
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
logger.info("Using Triton Attention backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends."
|
||||
"triton_attn.TritonAttentionBackend")
|
||||
if selected_backend == _Backend.ROCM_FLASH:
|
||||
if not cls.has_device_capability(90):
|
||||
# not Instinct series GPUs.
|
||||
logger.info("flash_attn is not supported on NAVI GPUs.")
|
||||
else:
|
||||
logger.info("%s is not supported in AMD GPUs.", selected_backend)
|
||||
logger.info("Using ROCmFlashAttention backend.")
|
||||
return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend" # noqa: E501
|
||||
|
||||
@classmethod
|
||||
@lru_cache(maxsize=8)
|
||||
def get_device_capability(cls,
|
||||
device_id: int = 0
|
||||
) -> Optional[DeviceCapability]:
|
||||
major, minor = torch.cuda.get_device_capability(device_id)
|
||||
return DeviceCapability(major=major, minor=minor)
|
||||
|
||||
@classmethod
|
||||
@with_amdsmi_context
|
||||
def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
|
||||
"""
|
||||
Query if the set of gpus are fully connected by xgmi (1 hop)
|
||||
"""
|
||||
handles = [
|
||||
amdsmi_get_processor_handles()[i] for i in physical_device_ids
|
||||
]
|
||||
for i, handle in enumerate(handles):
|
||||
for j, peer_handle in enumerate(handles):
|
||||
if i < j:
|
||||
try:
|
||||
link_type = amdsmi_topo_get_link_type(
|
||||
handle, peer_handle)
|
||||
# type is 2 for XGMI
|
||||
if link_type["hops"] != 1 or link_type["type"] != 2:
|
||||
return False
|
||||
except AmdSmiException as error:
|
||||
logger.error("AMD 1 hop XGMI detection failed.",
|
||||
exc_info=error)
|
||||
return False
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
@with_amdsmi_context
|
||||
@lru_cache(maxsize=8)
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
physical_device_id = cls.device_id_to_physical_device_id(device_id)
|
||||
handle = amdsmi_get_processor_handles()[physical_device_id]
|
||||
asic_info = amdsmi_get_gpu_asic_info(handle)
|
||||
device_name: str = asic_info["device_id"]
|
||||
if device_name in _ROCM_DEVICE_ID_NAME_MAP:
|
||||
return _ROCM_DEVICE_ID_NAME_MAP[device_name]
|
||||
return asic_info["market_name"]
|
||||
|
||||
@classmethod
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
device_props = torch.cuda.get_device_properties(device_id)
|
||||
return device_props.total_memory
|
||||
|
||||
@classmethod
|
||||
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
|
||||
if enforce_eager:
|
||||
logger.warning(
|
||||
"To see benefits of async output processing, enable CUDA "
|
||||
"graph. Since, enforce-eager is enabled, async output "
|
||||
"processor cannot be used")
|
||||
return False
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
|
||||
cache_config = vllm_config.cache_config
|
||||
if cache_config and cache_config.block_size is None:
|
||||
cache_config.block_size = 16
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
scheduler_config = vllm_config.scheduler_config
|
||||
if parallel_config.worker_cls == "auto":
|
||||
if scheduler_config.is_multi_step:
|
||||
if envs.VLLM_USE_V1:
|
||||
raise NotImplementedError(
|
||||
"Multi-step scheduling is not supported (and not "
|
||||
"needed) on vLLM V1. Please launch without "
|
||||
"--num-scheduler-steps.")
|
||||
else:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.worker.multi_step_worker.MultiStepWorker"
|
||||
elif vllm_config.speculative_config:
|
||||
if envs.VLLM_USE_V1:
|
||||
raise NotImplementedError(
|
||||
"Speculative decoding is not yet supported on vLLM V1."
|
||||
)
|
||||
else:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.spec_decode.spec_decode_worker.create_spec_worker"
|
||||
parallel_config.sd_worker_cls = \
|
||||
"vllm.worker.worker.Worker"
|
||||
else:
|
||||
if envs.VLLM_USE_V1:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.v1.worker.gpu_worker.Worker"
|
||||
else:
|
||||
parallel_config.worker_cls = "vllm.worker.worker.Worker"
|
||||
|
||||
@classmethod
|
||||
def verify_model_arch(cls, model_arch: str) -> None:
|
||||
if model_arch in _ROCM_UNSUPPORTED_MODELS:
|
||||
raise ValueError(f"Model architecture '{model_arch}' is not "
|
||||
"supported by ROCm for now.")
|
||||
|
||||
if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
|
||||
msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]
|
||||
logger.warning(
|
||||
"Model architecture '%s' is partially "
|
||||
"supported by ROCm: %s", model_arch, msg)
|
||||
|
||||
@classmethod
|
||||
def verify_quantization(cls, quant: str) -> None:
|
||||
super().verify_quantization(quant)
|
||||
if quant == "awq" and not envs.VLLM_USE_TRITON_AWQ:
|
||||
logger.warning(
|
||||
"Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
|
||||
" is not set, enabling VLLM_USE_TRITON_AWQ.")
|
||||
envs.VLLM_USE_TRITON_AWQ = True
|
||||
|
||||
@classmethod
|
||||
def get_punica_wrapper(cls) -> str:
|
||||
return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
|
||||
|
||||
@classmethod
|
||||
def get_current_memory_usage(cls,
|
||||
device: Optional[torch.types.Device] = None
|
||||
) -> float:
|
||||
torch.cuda.reset_peak_memory_stats(device)
|
||||
return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info(
|
||||
device)[0]
|
||||
|
||||
@classmethod
|
||||
def get_device_communicator_cls(cls) -> str:
|
||||
return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator" # noqa
|
||||
|
||||
@classmethod
|
||||
def supports_mx(cls) -> bool:
|
||||
gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
|
||||
return any(gfx in gcn_arch for gfx in ["gfx95"])
|
||||
|
||||
@classmethod
|
||||
def supports_fp8(cls) -> bool:
|
||||
gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
|
||||
return any(gfx in gcn_arch for gfx in ['gfx94', 'gfx95', 'gfx12'])
|
||||
|
||||
@classmethod
|
||||
def is_fp8_fnuz(cls) -> bool:
|
||||
# only device 0 is checked, this assumes MI300 platforms are homogeneous
|
||||
return 'gfx94' in torch.cuda.get_device_properties(0).gcnArchName
|
||||
|
||||
@classmethod
|
||||
def fp8_dtype(cls) -> torch.dtype:
|
||||
if cls.is_fp8_fnuz():
|
||||
return torch.float8_e4m3fnuz
|
||||
else:
|
||||
return torch.float8_e4m3fn
|
||||
|
||||
@classmethod
|
||||
def supports_v1(cls, model_config: "ModelConfig") -> bool:
|
||||
# V1 support on AMD gpus is experimental
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def use_custom_allreduce(cls) -> bool:
|
||||
# We only enable custom allreduce for MI300 series
|
||||
gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
|
||||
supported_archs = ['gfx94', 'gfx95']
|
||||
return any(gfx in gcn_arch for gfx in supported_archs)
|
||||
|
||||
@classmethod
|
||||
def get_cu_count(cls, device_id: int = 0) -> int:
|
||||
return torch.cuda.get_device_properties(
|
||||
device_id).multi_processor_count
|
||||
|
||||
@classmethod
|
||||
def is_navi(cls) -> bool:
|
||||
return 'gfx1' in torch.cuda.get_device_properties(0).gcnArchName
|
||||
|
||||
@classmethod
|
||||
def get_piecewise_backend_cls(cls) -> str:
|
||||
return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend" # noqa
|
||||
|
||||
@classmethod
|
||||
def stateless_init_device_torch_dist_pg(
|
||||
cls,
|
||||
backend: str,
|
||||
prefix_store: PrefixStore,
|
||||
group_rank: int,
|
||||
group_size: int,
|
||||
timeout: timedelta,
|
||||
) -> ProcessGroup:
|
||||
assert is_nccl_available()
|
||||
pg: ProcessGroup = ProcessGroup(
|
||||
prefix_store,
|
||||
group_rank,
|
||||
group_size,
|
||||
)
|
||||
from torch.distributed.distributed_c10d import ProcessGroupNCCL
|
||||
|
||||
backend_options = ProcessGroupNCCL.Options()
|
||||
backend_options._timeout = timeout
|
||||
|
||||
backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size,
|
||||
backend_options)
|
||||
backend_type = ProcessGroup.BackendType.NCCL
|
||||
device = torch.device("cuda")
|
||||
pg._set_default_backend(backend_type)
|
||||
backend_class._set_sequence_number_for_group()
|
||||
|
||||
pg._register_backend(device, backend_type, backend_class)
|
||||
return pg
|
||||
216
platforms/tpu.py
Normal file
216
platforms/tpu.py
Normal file
@@ -0,0 +1,216 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import TYPE_CHECKING, Optional, Union, cast
|
||||
|
||||
import torch
|
||||
from tpu_info import device
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.inputs import ProcessorInputs, PromptType
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sampling_params import SamplingParams, SamplingType
|
||||
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
|
||||
|
||||
from .interface import Platform, PlatformEnum, _Backend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import BlockSize, ModelConfig, VllmConfig
|
||||
from vllm.pooling_params import PoolingParams
|
||||
else:
|
||||
BlockSize = None
|
||||
ModelConfig = None
|
||||
VllmConfig = None
|
||||
PoolingParams = None
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class TpuPlatform(Platform):
|
||||
_enum = PlatformEnum.TPU
|
||||
device_name: str = "tpu"
|
||||
device_type: str = "tpu"
|
||||
dispatch_key: str = "XLA"
|
||||
ray_device_key: str = "TPU"
|
||||
device_control_env_var: str = "TPU_VISIBLE_CHIPS"
|
||||
simple_compile_backend: str = "openxla"
|
||||
|
||||
supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]
|
||||
|
||||
additional_env_vars: list[str] = [
|
||||
"TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
|
||||
dtype: torch.dtype, kv_cache_dtype: Optional[str],
|
||||
block_size: int, use_v1: bool,
|
||||
use_mla: bool) -> str:
|
||||
if (selected_backend != _Backend.PALLAS
|
||||
and selected_backend != _Backend.PALLAS_VLLM_V1):
|
||||
logger.info("Cannot use %s backend on TPU.", selected_backend)
|
||||
|
||||
if use_v1:
|
||||
logger.info("Using Pallas V1 backend.")
|
||||
return "vllm.v1.attention.backends.pallas.PallasAttentionBackend"
|
||||
else:
|
||||
logger.info("Using Pallas backend.")
|
||||
return "vllm.attention.backends.pallas.PallasAttentionBackend"
|
||||
|
||||
@classmethod
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
chip_type, _ = device.get_local_chips()
|
||||
return f"TPU {chip_type.name}"
|
||||
|
||||
@classmethod
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
|
||||
return not envs.VLLM_USE_V1
|
||||
|
||||
@classmethod
|
||||
def get_punica_wrapper(cls) -> str:
|
||||
return "vllm.lora.punica_wrapper.punica_tpu.PunicaWrapperTPU"
|
||||
|
||||
@classmethod
|
||||
def get_infinity_values(cls, dtype: torch.dtype) -> tuple[float, float]:
|
||||
return torch.finfo(dtype).min, torch.finfo(dtype).max
|
||||
|
||||
@classmethod
|
||||
def can_update_inplace(cls):
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_lora_vocab_padding_size(cls) -> int:
|
||||
return 1
|
||||
|
||||
@classmethod
|
||||
def inference_mode(cls):
|
||||
return torch.no_grad()
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
from vllm.config import CompilationLevel
|
||||
|
||||
cache_config = vllm_config.cache_config
|
||||
# For v0, the default block size is 16.
|
||||
if cache_config and cache_config.block_size is None:
|
||||
cache_config.block_size = cast(BlockSize, 16)
|
||||
compilation_config = vllm_config.compilation_config
|
||||
|
||||
# TPU only supports DYNAMO_ONCE compilation level
|
||||
if compilation_config.level != CompilationLevel.DYNAMO_ONCE:
|
||||
logger.info("[TPU] Forcing DYNAMO_ONCE compilation level")
|
||||
compilation_config.level = CompilationLevel.DYNAMO_ONCE
|
||||
|
||||
if compilation_config.backend == "":
|
||||
compilation_config.backend = "openxla"
|
||||
|
||||
assert vllm_config.speculative_config is None, \
|
||||
"TPU does not support speculative decoding"
|
||||
|
||||
if vllm_config.model_config.dtype in (torch.float16, torch.float32):
|
||||
logger.warning(
|
||||
"The TPU backend currently does not support %s. "
|
||||
"Using bfloat16 instead.", vllm_config.model_config.dtype)
|
||||
vllm_config.model_config.dtype = torch.bfloat16
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
from vllm.v1.attention.backends.pallas import (
|
||||
PallasAttentionBackend)
|
||||
cache_config.block_size = PallasAttentionBackend.get_page_size(
|
||||
vllm_config) # type: ignore[assignment]
|
||||
min_page_size = PallasAttentionBackend.get_min_page_size(
|
||||
vllm_config)
|
||||
if min_page_size > cache_config.block_size:
|
||||
logger.warning(
|
||||
"Increase the page size from %s to %s to make sure there's"
|
||||
"no SMEM OOM",
|
||||
cache_config.block_size,
|
||||
min_page_size,
|
||||
)
|
||||
cache_config.block_size = min_page_size # type: ignore[assignment]
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
scheduler_config = vllm_config.scheduler_config
|
||||
if parallel_config.worker_cls == "auto":
|
||||
if scheduler_config.is_multi_step:
|
||||
if envs.VLLM_USE_V1:
|
||||
raise NotImplementedError(
|
||||
"Multi-step scheduling is not supported (and not "
|
||||
"needed) on vLLM V1. Please launch without "
|
||||
"--num-scheduler-steps.")
|
||||
else:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.worker.multi_step_tpu_worker.MultiStepTPUWorker"
|
||||
else:
|
||||
if envs.VLLM_USE_V1:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.v1.worker.tpu_worker.TPUWorker"
|
||||
else:
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.worker.tpu_worker.TPUWorker"
|
||||
|
||||
assert not vllm_config.speculative_config, (
|
||||
"Speculative decoding is not yet supported for TPU backend")
|
||||
|
||||
if scheduler_config.is_multimodal_model and not \
|
||||
scheduler_config.disable_chunked_mm_input:
|
||||
logger.warning("TPU does not support running Multimodal models"\
|
||||
" without setting `--disable_chunked_mm_input`. " \
|
||||
"Forcing --disable_chunked_mm_input.")
|
||||
scheduler_config.disable_chunked_mm_input = True
|
||||
|
||||
if vllm_config.model_config and vllm_config.model_config.use_mla:
|
||||
logger.info(
|
||||
"MLA is enabled on a non-GPU platform; forcing chunked "
|
||||
"prefill and prefix caching to be disabled.")
|
||||
vllm_config.scheduler_config.enable_chunked_prefill = False
|
||||
vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = max(
|
||||
vllm_config.scheduler_config.max_model_len,
|
||||
DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
|
||||
@classmethod
|
||||
def is_pin_memory_available(cls):
|
||||
logger.warning("Pin memory is not supported on TPU.")
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_device_communicator_cls(cls) -> str:
|
||||
return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator" # noqa
|
||||
|
||||
@classmethod
|
||||
def use_all_gather(cls) -> bool:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def supports_v1(cls, model_config: ModelConfig) -> bool:
|
||||
# V1 support on TPU is experimental
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def validate_request(
|
||||
cls,
|
||||
prompt: PromptType,
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
processed_inputs: ProcessorInputs,
|
||||
) -> None:
|
||||
"""Raises if this request is unsupported on this platform"""
|
||||
if isinstance(params, SamplingParams):
|
||||
if params.guided_decoding is not None and not envs.VLLM_USE_V1:
|
||||
raise ValueError("Structured output is not supported on "
|
||||
f"{cls.device_name} V0.")
|
||||
if params.sampling_type == SamplingType.RANDOM_SEED:
|
||||
raise ValueError(
|
||||
"Torch XLA does not support per-request seed.")
|
||||
|
||||
|
||||
try:
|
||||
from tpu_commons.platforms import TpuPlatform as TpuCommonsPlatform
|
||||
TpuPlatform = TpuCommonsPlatform # type: ignore
|
||||
except ImportError:
|
||||
logger.info("tpu_commons not found, using vLLM's TpuPlatform")
|
||||
pass
|
||||
156
platforms/xpu.py
Normal file
156
platforms/xpu.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
|
||||
|
||||
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
else:
|
||||
VllmConfig = None
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class XPUPlatform(Platform):
|
||||
_enum = PlatformEnum.XPU
|
||||
device_name: str = "xpu"
|
||||
device_type: str = "xpu"
|
||||
dispatch_key: str = "XPU"
|
||||
# Intel XPU's device key is "GPU" for Ray.
|
||||
# see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
|
||||
ray_device_key: str = "GPU"
|
||||
device_control_env_var: str = "ONEAPI_DEVICE_SELECTOR"
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
|
||||
dtype: torch.dtype, kv_cache_dtype: Optional[str],
|
||||
block_size: int, use_v1: bool,
|
||||
use_mla: bool) -> str:
|
||||
if selected_backend != _Backend.IPEX:
|
||||
logger.info("Cannot use %s backend on XPU.", selected_backend)
|
||||
logger.info("Using IPEX attention backend.")
|
||||
return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
|
||||
|
||||
@classmethod
|
||||
def get_device_capability(
|
||||
cls,
|
||||
device_id: int = 0,
|
||||
) -> Optional[DeviceCapability]:
|
||||
# capacity format differs from cuda's and will cause unexpected
|
||||
# failure, so use None directly
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
return torch.xpu.get_device_name(device_id)
|
||||
|
||||
@classmethod
|
||||
def get_device_total_memory(cls, device_id: int = 0) -> int:
|
||||
device_props = torch.xpu.get_device_properties(device_id)
|
||||
return device_props.total_memory
|
||||
|
||||
@classmethod
|
||||
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def inference_mode(cls):
|
||||
return torch.no_grad()
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
cache_config = vllm_config.cache_config
|
||||
if cache_config and cache_config.block_size is None:
|
||||
cache_config.block_size = 16
|
||||
|
||||
# check and update model config
|
||||
model_config = vllm_config.model_config
|
||||
if model_config.dtype == torch.bfloat16:
|
||||
bf16_supported = cls.device_support_bf16()
|
||||
if not bf16_supported:
|
||||
logger.warning(
|
||||
"bfloat16 is only supported on Intel Data Center GPU, "
|
||||
"Intel Arc GPU is not supported yet. Your device is %s,"
|
||||
" which is not supported. will fallback to float16",
|
||||
cls.get_device_name())
|
||||
model_config.dtype = torch.float16
|
||||
if not model_config.enforce_eager:
|
||||
logger.warning(
|
||||
"CUDA graph is not supported on XPU, fallback to the eager "
|
||||
"mode.")
|
||||
model_config.enforce_eager = True
|
||||
|
||||
if vllm_config.speculative_config is not None:
|
||||
raise NotImplementedError(
|
||||
"XPU does not support speculative decoding")
|
||||
|
||||
if vllm_config.device_config is not None:
|
||||
assert vllm_config.device_config.device_type == "xpu"
|
||||
|
||||
# check and update parallel config
|
||||
parallel_config = vllm_config.parallel_config
|
||||
if parallel_config.worker_cls == "auto":
|
||||
parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
|
||||
|
||||
if parallel_config.distributed_executor_backend is None:
|
||||
parallel_config.distributed_executor_backend = "ray"
|
||||
elif parallel_config.distributed_executor_backend == "mp":
|
||||
# FIXME(kunshang):
|
||||
# spawn needs calling `if __name__ == '__main__':``
|
||||
# fork is not supported for xpu start new process.
|
||||
logger.error(
|
||||
"Both start methods (spawn and fork) have issue "
|
||||
"on XPU if you use mp backend, setting it to ray instead.")
|
||||
parallel_config.distributed_executor_backend = "ray"
|
||||
|
||||
elif parallel_config.distributed_executor_backend != "ray":
|
||||
logger.warning(
|
||||
"%s is not supported on XPU, fallback to ray distributed"
|
||||
" executor backend.",
|
||||
parallel_config.distributed_executor_backend)
|
||||
parallel_config.distributed_executor_backend = "ray"
|
||||
|
||||
if vllm_config.model_config and vllm_config.model_config.use_mla:
|
||||
logger.info(
|
||||
"MLA is enabled on a non-GPU platform; forcing chunked "
|
||||
"prefill and prefix caching to be disabled.")
|
||||
vllm_config.scheduler_config.enable_chunked_prefill = False
|
||||
vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = max(
|
||||
vllm_config.scheduler_config.max_model_len,
|
||||
DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
|
||||
@classmethod
|
||||
def is_pin_memory_available(cls):
|
||||
logger.warning("Pin memory is not supported on XPU.")
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_current_memory_usage(cls,
|
||||
device: Optional[torch.types.Device] = None
|
||||
) -> float:
|
||||
torch.xpu.reset_peak_memory_stats(device)
|
||||
return torch.xpu.max_memory_allocated(device)
|
||||
|
||||
@classmethod
|
||||
def device_support_bf16(cls) -> bool:
|
||||
device_name = cls.get_device_name().lower()
|
||||
if device_name.count("arc") > 0:
|
||||
return False
|
||||
elif device_name.count("data center gpu") > 0:
|
||||
return True
|
||||
else:
|
||||
logger.warning("Unknown device name %s, always use float16",
|
||||
device_name)
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_device_communicator_cls(cls) -> str:
|
||||
return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator" # noqa
|
||||
Reference in New Issue
Block a user