forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
124
vllm-v0.6.2/vllm/platforms/__init__.py
Normal file
124
vllm-v0.6.2/vllm/platforms/__init__.py
Normal file
@@ -0,0 +1,124 @@
|
||||
from .interface import Platform, PlatformEnum, UnspecifiedPlatform
|
||||
|
||||
current_platform: Platform
|
||||
|
||||
# NOTE: `torch.version.cuda` / `torch.version.hip` are deliberately not
# consulted here: they describe how PyTorch was built, not the hardware that
# is present at runtime (a CUDA build of PyTorch can be installed on a TPU
# host, for example).

# --- TPU ---------------------------------------------------------------------
# libtpu could technically be installed on a machine without TPUs, but that
# is uncommon enough that "libtpu is importable" is treated as "TPUs exist".
try:
    import libtpu  # noqa: F401
except Exception:
    is_tpu = False
else:
    is_tpu = True

# --- CUDA --------------------------------------------------------------------
# NVML is initialised only long enough to count devices, then shut down.
is_cuda = False
try:
    import pynvml
    pynvml.nvmlInit()
    try:
        is_cuda = pynvml.nvmlDeviceGetCount() > 0
    finally:
        pynvml.nvmlShutdown()
except Exception:
    pass

# --- ROCm --------------------------------------------------------------------
# Same probe shape as CUDA, using amdsmi processor handles.
is_rocm = False
try:
    import amdsmi
    amdsmi.amdsmi_init()
    try:
        is_rocm = len(amdsmi.amdsmi_get_processor_handles()) > 0
    finally:
        amdsmi.amdsmi_shut_down()
except Exception:
    pass

# --- HPU ---------------------------------------------------------------------
# Only checks that the `habana_frameworks` package can be located; it is not
# imported here.
try:
    from importlib import util
    _hpu_spec_found = util.find_spec('habana_frameworks') is not None
except Exception:
    is_hpu = False
else:
    is_hpu = _hpu_spec_found

# --- XPU ---------------------------------------------------------------------
# IPEX and the oneCCL bindings are assumed to be installed only if the
# machine has XPUs; on top of that, torch must report an available XPU.
is_xpu = False
try:
    import intel_extension_for_pytorch  # noqa: F401
    import oneccl_bindings_for_pytorch  # noqa: F401
    import torch
    is_xpu = bool(hasattr(torch, 'xpu') and torch.xpu.is_available())
except Exception:
    pass

# --- CPU ---------------------------------------------------------------------
# Detected from the installed vllm distribution's version string.
is_cpu = False
try:
    from importlib.metadata import version
    is_cpu = "cpu" in version("vllm")
except Exception:
    pass

# --- Neuron ------------------------------------------------------------------
try:
    import transformers_neuronx  # noqa: F401
except ImportError:
    is_neuron = False
else:
    is_neuron = True

# --- OpenVINO ----------------------------------------------------------------
# Detected from the installed vllm distribution's version string.
is_openvino = False
try:
    from importlib.metadata import version
    is_openvino = "openvino" in version("vllm")
except Exception:
    pass

# --- MLU ---------------------------------------------------------------------
try:
    import torch_mlu  # noqa: F401
except ImportError:
    is_mlu = False
else:
    is_mlu = True
|
||||
|
||||
# Pick exactly one platform backend. The order of the branches is the
# priority order when several probes succeeded. Each backend module is
# imported only inside its own branch, so the other backends' modules are
# never loaded.
if is_tpu:
    # TPU is checked first: people might install a CUDA build of PyTorch
    # but actually run on TPU.
    from .tpu import TpuPlatform
    current_platform = TpuPlatform()
elif is_cuda:
    from .cuda import CudaPlatform
    current_platform = CudaPlatform()
elif is_rocm:
    from .rocm import RocmPlatform
    current_platform = RocmPlatform()
elif is_hpu:
    from .hpu import HpuPlatform
    current_platform = HpuPlatform()
elif is_xpu:
    from .xpu import XPUPlatform
    current_platform = XPUPlatform()
elif is_cpu:
    from .cpu import CpuPlatform
    current_platform = CpuPlatform()
elif is_neuron:
    from .neuron import NeuronPlatform
    current_platform = NeuronPlatform()
elif is_openvino:
    from .openvino import OpenVinoPlatform
    current_platform = OpenVinoPlatform()
elif is_mlu:
    from .mlu import MluPlatform
    current_platform = MluPlatform()
else:
    # No probe succeeded: fall back to the placeholder platform.
    current_platform = UnspecifiedPlatform()

# Public names re-exported by this package.
__all__ = ["Platform", "PlatformEnum", "current_platform"]
|
||||
BIN
vllm-v0.6.2/vllm/platforms/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
vllm-v0.6.2/vllm/platforms/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm-v0.6.2/vllm/platforms/__pycache__/interface.cpython-310.pyc
Normal file
BIN
vllm-v0.6.2/vllm/platforms/__pycache__/interface.cpython-310.pyc
Normal file
Binary file not shown.
BIN
vllm-v0.6.2/vllm/platforms/__pycache__/mlu.cpython-310.pyc
Normal file
BIN
vllm-v0.6.2/vllm/platforms/__pycache__/mlu.cpython-310.pyc
Normal file
Binary file not shown.
20
vllm-v0.6.2/vllm/platforms/cpu.py
Normal file
20
vllm-v0.6.2/vllm/platforms/cpu.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import psutil
|
||||
import torch
|
||||
|
||||
from .interface import Platform, PlatformEnum
|
||||
|
||||
|
||||
class CpuPlatform(Platform):
    """Platform backend used when vLLM runs on the host CPU."""

    _enum = PlatformEnum.CPU

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the constant name ``"cpu"``; ``device_id`` is ignored."""
        return "cpu"

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return the ``total`` field of ``psutil.virtual_memory()``.

        ``device_id`` is ignored.
        """
        memory_stats = psutil.virtual_memory()
        return memory_stats.total

    @classmethod
    def inference_mode(cls):
        """Return ``torch.no_grad()`` instead of ``torch.inference_mode``."""
        return torch.no_grad()
|
||||
150
vllm-v0.6.2/vllm/platforms/cuda.py
Normal file
150
vllm-v0.6.2/vllm/platforms/cuda.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""Code inside this file can safely assume cuda platform, e.g. importing
|
||||
pynvml. However, it should not initialize cuda context.
|
||||
"""
|
||||
|
||||
import os
|
||||
from functools import lru_cache, wraps
|
||||
from typing import Callable, List, Tuple, TypeVar
|
||||
|
||||
import pynvml
|
||||
import torch
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .interface import DeviceCapability, Platform, PlatformEnum
|
||||
|
||||
# Module-level logger for the CUDA platform backend.
logger = init_logger(__name__)

# Typing helpers used to keep decorated functions' signatures intact.
_P = ParamSpec("_P")
_R = TypeVar("_R")

# The deprecated `pynvml` distribution is a package (its module file is an
# `__init__.py`); warn the user so they switch to `nvidia-ml-py`.
if pynvml.__file__.endswith("__init__.py"):
    logger.warning(
        "You are using a deprecated `pynvml` package. Please install"
        " `nvidia-ml-py` instead, and make sure to uninstall `pynvml`."
        " When both of them are installed, `pynvml` will take precedence"
        " and cause errors. See https://pypi.org/project/pynvml "
        "for more information.")

# PyTorch 2.5 uses cuDNN SDPA by default, which crashes on some models; see
# https://github.com/huggingface/diffusers/issues/9704 for details.
torch.backends.cuda.enable_cudnn_sdp(False)
|
||||
|
||||
# NVML utils
|
||||
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
|
||||
# all the related functions work on real physical device ids.
|
||||
# the major benefit of using NVML is that it will not initialize CUDA
|
||||
|
||||
|
||||
def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
    """Decorate ``fn`` so each call runs between NVML init and shutdown.

    ``pynvml.nvmlInit()`` is called before ``fn`` and
    ``pynvml.nvmlShutdown()`` afterwards, whether ``fn`` returns or raises.

    Args:
        fn: The callable to wrap; its signature and metadata are preserved
            via :func:`functools.wraps`.

    Returns:
        A wrapper with the same call signature that returns ``fn``'s result.
    """

    @wraps(fn)
    def _nvml_scoped(*args: _P.args, **kwargs: _P.kwargs) -> _R:
        pynvml.nvmlInit()
        try:
            outcome = fn(*args, **kwargs)
        finally:
            # Always pair the init above with a shutdown.
            pynvml.nvmlShutdown()
        return outcome

    return _nvml_scoped
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
@with_nvml_context
def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]:
    """Return the CUDA compute capability NVML reports for a device.

    Args:
        device_id: NVML device index used to look up the device handle.

    Returns:
        Whatever ``pynvml.nvmlDeviceGetCudaComputeCapability`` returns for
        that handle. Results are memoized per ``device_id`` (up to 8).
    """
    device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    capability = pynvml.nvmlDeviceGetCudaComputeCapability(device_handle)
    return capability
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
@with_nvml_context
def get_physical_device_name(device_id: int = 0) -> str:
    """Return the device name NVML reports for a device.

    Args:
        device_id: NVML device index used to look up the device handle.

    Returns:
        The result of ``pynvml.nvmlDeviceGetName`` for that handle. Results
        are memoized per ``device_id`` (up to 8).
    """
    device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    device_name = pynvml.nvmlDeviceGetName(device_handle)
    return device_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
@with_nvml_context
def get_physical_device_total_memory(device_id: int = 0) -> int:
    """Return the ``total`` memory figure NVML reports for a device.

    Args:
        device_id: NVML device index used to look up the device handle.

    Returns:
        ``nvmlDeviceGetMemoryInfo(handle).total`` converted to ``int``.
        Results are memoized per ``device_id`` (up to 8).
    """
    device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    memory_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
    return int(memory_info.total)
|
||||
|
||||
|
||||
@with_nvml_context
def warn_if_different_devices():
    """Warn when the devices NVML reports do not all share one name.

    Nothing is logged when at most one device is present, when all device
    names match, or when ``CUDA_DEVICE_ORDER`` is already ``PCI_BUS_ID``.
    """
    num_devices: int = pynvml.nvmlDeviceGetCount()
    if num_devices <= 1:
        return

    names = [get_physical_device_name(idx) for idx in range(num_devices)]
    has_mixed_models = len(set(names)) > 1
    order_is_pinned = os.environ.get("CUDA_DEVICE_ORDER") == "PCI_BUS_ID"
    if has_mixed_models and not order_is_pinned:
        logger.warning(
            "Detected different devices in the system: \n%s\nPlease"
            " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
            "avoid unexpected behavior.", "\n".join(names))
|
||||
|
||||
|
||||
# Run the mixed-device check at import time, except when `pynvml` is a Sphinx
# autodoc mock (documentation builds). If Sphinx is not installed, pynvml
# cannot be a Sphinx mock, so the check runs unconditionally.
try:
    from sphinx.ext.autodoc.mock import _MockModule

    pynvml_is_mocked = isinstance(pynvml, _MockModule)
    if not pynvml_is_mocked:
        warn_if_different_devices()
except ModuleNotFoundError:
    warn_if_different_devices()
|
||||
|
||||
|
||||
def device_id_to_physical_device_id(device_id: int) -> int:
    """Translate a logical device index into a physical device index.

    When ``CUDA_VISIBLE_DEVICES`` is set, ``device_id`` is used as a position
    in its comma-separated list and the entry there is returned as an int.
    When the variable is not set, ``device_id`` is returned unchanged.

    Args:
        device_id: Index into the visible-device list.

    Returns:
        The physical device id.

    Raises:
        RuntimeError: If ``CUDA_VISIBLE_DEVICES`` is set to an empty string.
    """
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible_devices is None:
        return device_id

    entries = visible_devices.split(",")
    if entries == [""]:
        raise RuntimeError("CUDA_VISIBLE_DEVICES is set to empty string,"
                           " which means GPU support is disabled.")
    return int(entries[device_id])
|
||||
|
||||
|
||||
class CudaPlatform(Platform):
    """Platform backend for CUDA devices, answering queries through NVML.

    Each device query first maps the logical ``device_id`` to a physical id
    with ``device_id_to_physical_device_id``.
    """

    _enum = PlatformEnum.CUDA

    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        """Return the compute capability of the given logical device."""
        major, minor = get_physical_device_capability(
            device_id_to_physical_device_id(device_id))
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the NVML device name of the given logical device."""
        return get_physical_device_name(
            device_id_to_physical_device_id(device_id))

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return the NVML total-memory figure of the given logical device."""
        return get_physical_device_total_memory(
            device_id_to_physical_device_id(device_id))

    @classmethod
    @with_nvml_context
    def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
        """Query whether the given GPUs are fully connected by NVLink (1 hop).

        Args:
            physical_device_ids: NVML device indices to check pairwise.

        Returns:
            ``True`` if every unordered pair reports
            ``NVML_P2P_STATUS_OK`` for the NVLink capability; ``False`` as
            soon as one pair does not, or when NVML raises ``NVMLError``.
        """
        handles = [
            pynvml.nvmlDeviceGetHandleByIndex(idx)
            for idx in physical_device_ids
        ]
        for position, handle in enumerate(handles):
            # Only look at peers after `position` so each pair is checked once.
            for peer_handle in handles[position + 1:]:
                try:
                    link_status = pynvml.nvmlDeviceGetP2PStatus(
                        handle, peer_handle,
                        pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
                except pynvml.NVMLError:
                    logger.exception(
                        "NVLink detection failed. This is normal if your"
                        " machine has no NVLink equipped.")
                    return False
                if link_status != pynvml.NVML_P2P_STATUS_OK:
                    return False
        return True
|
||||
11
vllm-v0.6.2/vllm/platforms/hpu.py
Normal file
11
vllm-v0.6.2/vllm/platforms/hpu.py
Normal file
@@ -0,0 +1,11 @@
|
||||
import torch
|
||||
|
||||
from .interface import Platform, PlatformEnum
|
||||
|
||||
|
||||
class HpuPlatform(Platform):
    """Platform backend for HPU devices."""

    _enum = PlatformEnum.HPU

    @staticmethod
    def inference_mode():
        """Return ``torch.no_grad()`` instead of ``torch.inference_mode``."""
        no_grad_context = torch.no_grad()
        return no_grad_context
|
||||
141
vllm-v0.6.2/vllm/platforms/interface.py
Normal file
141
vllm-v0.6.2/vllm/platforms/interface.py
Normal file
@@ -0,0 +1,141 @@
|
||||
import enum
|
||||
import random
|
||||
from typing import NamedTuple, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
class PlatformEnum(enum.Enum):
    """Identifiers for the hardware platforms a ``Platform`` can represent.

    Values are assigned by ``enum.auto()`` in declaration order.
    """

    # GPU-style accelerators.
    CUDA = enum.auto()
    ROCM = enum.auto()

    # Other accelerators and host execution.
    TPU = enum.auto()
    HPU = enum.auto()
    XPU = enum.auto()
    CPU = enum.auto()
    NEURON = enum.auto()
    OPENVINO = enum.auto()
    MLU = enum.auto()

    # Placeholder used by ``UnspecifiedPlatform``.
    UNSPECIFIED = enum.auto()
|
||||
|
||||
|
||||
class DeviceCapability(NamedTuple):
    """A ``(major, minor)`` device capability version."""

    major: int
    minor: int

    def as_version_str(self) -> str:
        """Return the capability formatted as ``"<major>.<minor>"``."""
        return "{}.{}".format(self.major, self.minor)

    def to_int(self) -> int:
        """
        Express device capability as an integer ``<major><minor>``.

        It is assumed that the minor version is always a single digit.
        """
        major, minor = self
        assert 0 <= minor < 10
        return 10 * major + minor
|
||||
|
||||
|
||||
class Platform:
    """Base class for hardware platform backends.

    Subclasses set ``_enum`` to their :class:`PlatformEnum` member and
    override the device-query methods they support. The ``is_*`` helpers only
    compare ``_enum`` and touch no device.
    """

    # Identifies the platform; assigned by each concrete subclass.
    _enum: PlatformEnum

    def is_cuda(self) -> bool:
        return self._enum == PlatformEnum.CUDA

    def is_rocm(self) -> bool:
        return self._enum == PlatformEnum.ROCM

    def is_tpu(self) -> bool:
        return self._enum == PlatformEnum.TPU

    def is_hpu(self) -> bool:
        return self._enum == PlatformEnum.HPU

    def is_xpu(self) -> bool:
        return self._enum == PlatformEnum.XPU

    def is_cpu(self) -> bool:
        return self._enum == PlatformEnum.CPU

    def is_neuron(self) -> bool:
        return self._enum == PlatformEnum.NEURON

    def is_openvino(self) -> bool:
        return self._enum == PlatformEnum.OPENVINO

    def is_mlu(self) -> bool:
        return self._enum == PlatformEnum.MLU

    def is_cuda_alike(self) -> bool:
        """Stateless version of :func:`torch.cuda.is_available`."""
        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)

    # NOTE: this method used to be defined twice in a row (a classmethod with
    # only a docstring, then a staticmethod returning ``None``). The second
    # definition silently replaced the first. A single classmethod is kept;
    # it is callable exactly as before, from the class or an instance, with
    # ``device_id`` passed positionally or by keyword.
    @classmethod
    def get_device_capability(
        cls,
        device_id: int = 0,
    ) -> Optional[DeviceCapability]:
        """Stateless version of :func:`torch.cuda.get_device_capability`.

        The base implementation reports no capability.

        Args:
            device_id: Index of the device to query (unused here).

        Returns:
            ``None``; subclasses return a :class:`DeviceCapability`.
        """
        return None

    @classmethod
    def has_device_capability(
        cls,
        capability: Union[Tuple[int, int], int],
        device_id: int = 0,
    ) -> bool:
        """
        Test whether this platform is compatible with a device capability.

        The ``capability`` argument can either be:

        - A tuple ``(major, minor)``.
        - An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)

        Returns ``False`` when the platform reports no capability.
        """
        current_capability = cls.get_device_capability(device_id=device_id)
        if current_capability is None:
            return False

        # DeviceCapability is a NamedTuple, so it compares element-wise
        # against a plain ``(major, minor)`` tuple.
        if isinstance(capability, tuple):
            return current_capability >= capability

        return current_capability.to_int() >= capability

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Get the name of a device.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Get the total memory of a device in bytes.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

    @classmethod
    def inference_mode(cls):
        """A device-specific wrapper of `torch.inference_mode`.

        This wrapper is recommended because some hardware backends such as TPU
        do not support `torch.inference_mode`. In such a case, they will fall
        back to `torch.no_grad` by overriding this method.
        """
        return torch.inference_mode(mode=True)

    @classmethod
    def seed_everything(cls, seed: int) -> None:
        """
        Set the seed of each random module.
        `torch.manual_seed` will set seed on all devices.

        Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
        """
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
|
||||
|
||||
|
||||
class UnspecifiedPlatform(Platform):
    """Placeholder platform that only sets the ``UNSPECIFIED`` enum value."""

    _enum = PlatformEnum.UNSPECIFIED
|
||||
25
vllm-v0.6.2/vllm/platforms/mlu.py
Normal file
25
vllm-v0.6.2/vllm/platforms/mlu.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from functools import lru_cache
|
||||
|
||||
import torch
|
||||
|
||||
from .interface import DeviceCapability, Platform, PlatformEnum
|
||||
|
||||
|
||||
class MluPlatform(Platform):
    """Platform backend for MLU devices, answering queries via ``torch.mlu``.

    NOTE(review): this module only imports ``torch``; presumably ``torch.mlu``
    is registered by an earlier ``import torch_mlu`` — confirm.
    """

    _enum = PlatformEnum.MLU

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        """Return the capability ``torch.mlu`` reports for ``device_id``.

        Results are memoized (up to 8 entries).
        """
        capability = torch.mlu.get_device_capability(device_id)
        major, minor = capability
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the name ``torch.mlu`` reports for ``device_id``.

        Results are memoized (up to 8 entries).
        """
        device_name = torch.mlu.get_device_name(device_id)
        return device_name

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return ``total_memory`` from the device properties of ``device_id``."""
        return torch.mlu.get_device_properties(device_id).total_memory
|
||||
9
vllm-v0.6.2/vllm/platforms/neuron.py
Normal file
9
vllm-v0.6.2/vllm/platforms/neuron.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from .interface import Platform, PlatformEnum
|
||||
|
||||
|
||||
class NeuronPlatform(Platform):
    """Platform backend for Neuron devices."""

    _enum = PlatformEnum.NEURON

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the constant name ``"neuron"``; ``device_id`` is ignored."""
        device_name = "neuron"
        return device_name
|
||||
33
vllm-v0.6.2/vllm/platforms/openvino.py
Normal file
33
vllm-v0.6.2/vllm/platforms/openvino.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .interface import Platform, PlatformEnum
|
||||
|
||||
# Module-level logger for the OpenVINO platform backend.
logger = init_logger(__name__)


class OpenVinoPlatform(Platform):
    """Platform backend for the OpenVINO runtime.

    The target device is read from ``envs.VLLM_OPENVINO_DEVICE``.
    """

    _enum = PlatformEnum.OPENVINO

    # The classmethods below take ``cls`` (they were previously spelled
    # ``self``), matching PEP 8 and the other platform classes. The first
    # parameter is bound automatically, so callers are unaffected.

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the constant name ``"openvino"``; ``device_id`` is ignored."""
        return "openvino"

    @classmethod
    def inference_mode(cls):
        """Return ``torch.inference_mode(mode=True)``."""
        return torch.inference_mode(mode=True)

    @classmethod
    def is_openvino_cpu(cls) -> bool:
        """Return whether ``VLLM_OPENVINO_DEVICE`` contains ``"CPU"``."""
        return "CPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_openvino_gpu(cls) -> bool:
        """Return whether ``VLLM_OPENVINO_DEVICE`` contains ``"GPU"``."""
        return "GPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        """Log a warning and report that pinned memory is unavailable.

        Returns:
            Always ``False``.
        """
        # The message previously misspelled the product name as "OpenViNO".
        logger.warning("Pin memory is not supported on OpenVINO.")
        return False
|
||||
36
vllm-v0.6.2/vllm/platforms/rocm.py
Normal file
36
vllm-v0.6.2/vllm/platforms/rocm.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import os
|
||||
from functools import lru_cache
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .interface import DeviceCapability, Platform, PlatformEnum
|
||||
|
||||
# Module-level logger for the ROCm platform backend.
logger = init_logger(__name__)

# ROCm does not support the `fork` start method. When the worker method is
# unset or explicitly `fork`, switch it to `spawn` and tell the user.
if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") in (None, "fork"):
    logger.warning("`fork` method is not supported by ROCm. "
                   "VLLM_WORKER_MULTIPROC_METHOD is overridden to"
                   " `spawn` instead.")
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
class RocmPlatform(Platform):
    """Platform backend for ROCm devices, answering queries via ``torch.cuda``."""

    _enum = PlatformEnum.ROCM

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        """Return the capability ``torch.cuda`` reports for ``device_id``.

        Results are memoized (up to 8 entries).
        """
        capability = torch.cuda.get_device_capability(device_id)
        major, minor = capability
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    @lru_cache(maxsize=8)
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the name ``torch.cuda`` reports for ``device_id``.

        Results are memoized (up to 8 entries).
        """
        device_name = torch.cuda.get_device_name(device_id)
        return device_name

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return ``total_memory`` from the device properties of ``device_id``."""
        return torch.cuda.get_device_properties(device_id).total_memory
|
||||
33
vllm-v0.6.2/vllm/platforms/tpu.py
Normal file
33
vllm-v0.6.2/vllm/platforms/tpu.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import os
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.compilation.levels import CompilationLevel
|
||||
from vllm.plugins import set_torch_compile_backend
|
||||
|
||||
from .interface import Platform, PlatformEnum
|
||||
|
||||
# Default the torch.compile level to DYNAMO_ONCE unless the user has already
# set one in the environment.
os.environ.setdefault("VLLM_TORCH_COMPILE_LEVEL",
                      str(CompilationLevel.DYNAMO_ONCE))

# Levels at or above PIECEWISE are rejected on TPU.
assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE, (
    "TPU does not support Inductor.")

# Use the "openxla" backend for torch.compile.
set_torch_compile_backend("openxla")
|
||||
|
||||
|
||||
class TpuPlatform(Platform):
    """Platform backend for TPU devices.

    Device name and total-memory queries are not implemented here.
    """

    _enum = PlatformEnum.TPU

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Not implemented for TPU.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Not implemented for TPU.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    @classmethod
    def inference_mode(cls):
        """Return ``torch.no_grad()`` instead of ``torch.inference_mode``."""
        no_grad_context = torch.no_grad()
        return no_grad_context
|
||||
26
vllm-v0.6.2/vllm/platforms/xpu.py
Normal file
26
vllm-v0.6.2/vllm/platforms/xpu.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import torch
|
||||
|
||||
from .interface import DeviceCapability, Platform, PlatformEnum
|
||||
|
||||
|
||||
class XPUPlatform(Platform):
    """Platform backend for XPU devices, answering queries via ``torch.xpu``."""

    _enum = PlatformEnum.XPU

    @staticmethod
    def get_device_capability(device_id: int = 0) -> DeviceCapability:
        """Return the capability parsed from the device's version string.

        The ``'version'`` entry of ``torch.xpu.get_device_capability`` is
        split on ``'.'``; the first two components become major and minor,
        and any further components are ignored.
        """
        version_text = torch.xpu.get_device_capability(device_id)['version']
        major, minor, *_ = version_text.split('.')
        return DeviceCapability(major=int(major), minor=int(minor))

    @staticmethod
    def get_device_name(device_id: int = 0) -> str:
        """Return the name ``torch.xpu`` reports for ``device_id``."""
        device_name = torch.xpu.get_device_name(device_id)
        return device_name

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return ``total_memory`` from the device properties of ``device_id``."""
        return torch.xpu.get_device_properties(device_id).total_memory

    @staticmethod
    def inference_mode():
        """Return ``torch.no_grad()`` instead of ``torch.inference_mode``."""
        no_grad_context = torch.no_grad()
        return no_grad_context
|
||||
Reference in New Issue
Block a user