add qwen3

Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

vllm/platforms/__init__.py

@@ -0,0 +1,124 @@
from .interface import Platform, PlatformEnum, UnspecifiedPlatform
current_platform: Platform
# NOTE: we don't use `torch.version.cuda` / `torch.version.hip` because
# they only indicate the build configuration, not the runtime environment.
# For example, people can install a cuda build of pytorch but run on tpu.
is_tpu = False
try:
# While it's technically possible to install libtpu on a non-TPU machine,
# this is a very uncommon scenario. Therefore, we assume that libtpu is
# installed if and only if the machine has TPUs.
import libtpu # noqa: F401
is_tpu = True
except Exception:
pass
is_cuda = False
try:
import pynvml
pynvml.nvmlInit()
try:
if pynvml.nvmlDeviceGetCount() > 0:
is_cuda = True
finally:
pynvml.nvmlShutdown()
except Exception:
pass
is_rocm = False
try:
import amdsmi
amdsmi.amdsmi_init()
try:
if len(amdsmi.amdsmi_get_processor_handles()) > 0:
is_rocm = True
finally:
amdsmi.amdsmi_shut_down()
except Exception:
pass
is_hpu = False
try:
from importlib import util
is_hpu = util.find_spec('habana_frameworks') is not None
except Exception:
pass
is_xpu = False
try:
# Assume IPEX is installed if and only if the machine has XPUs.
import intel_extension_for_pytorch # noqa: F401
import oneccl_bindings_for_pytorch # noqa: F401
import torch
if hasattr(torch, 'xpu') and torch.xpu.is_available():
is_xpu = True
except Exception:
pass
is_cpu = False
try:
from importlib.metadata import version
is_cpu = "cpu" in version("vllm")
except Exception:
pass
is_neuron = False
try:
import transformers_neuronx # noqa: F401
is_neuron = True
except ImportError:
pass
is_openvino = False
try:
from importlib.metadata import version
is_openvino = "openvino" in version("vllm")
except Exception:
pass
is_mlu = False
try:
import torch_mlu
is_mlu = True
except ImportError:
pass
if is_tpu:
# people might install pytorch built with cuda but run on tpu
# so we need to check tpu first
from .tpu import TpuPlatform
current_platform = TpuPlatform()
elif is_cuda:
from .cuda import CudaPlatform
current_platform = CudaPlatform()
elif is_rocm:
from .rocm import RocmPlatform
current_platform = RocmPlatform()
elif is_hpu:
from .hpu import HpuPlatform
current_platform = HpuPlatform()
elif is_xpu:
from .xpu import XPUPlatform
current_platform = XPUPlatform()
elif is_cpu:
from .cpu import CpuPlatform
current_platform = CpuPlatform()
elif is_neuron:
from .neuron import NeuronPlatform
current_platform = NeuronPlatform()
elif is_openvino:
from .openvino import OpenVinoPlatform
current_platform = OpenVinoPlatform()
elif is_mlu:
from .mlu import MluPlatform
current_platform = MluPlatform()
else:
current_platform = UnspecifiedPlatform()
__all__ = ['Platform', 'PlatformEnum', 'current_platform']
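For context, a minimal sketch of how downstream code is expected to consume this module, assuming the package is importable as vllm.platforms (the is_* predicates and get_device_name are defined in interface.py below; the printed name assumes a CUDA-alike machine):

from vllm.platforms import current_platform

# Dispatch on the detected backend rather than probing torch directly.
if current_platform.is_cuda_alike():
    print(current_platform.get_device_name(0))  # e.g. the NVML device name
elif current_platform.is_cpu():
    print("running on CPU")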

vllm/platforms/cpu.py

@@ -0,0 +1,20 @@
import psutil
import torch
from .interface import Platform, PlatformEnum
class CpuPlatform(Platform):
_enum = PlatformEnum.CPU
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
return "cpu"
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
return psutil.virtual_memory().total
@classmethod
def inference_mode(cls):
return torch.no_grad()
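As a quick sanity check of the contract above: get_device_total_memory reports host RAM in bytes via psutil. A tiny sketch (the printed value is illustrative):

import psutil

total_bytes = psutil.virtual_memory().total
print(f"{total_bytes / 1024**3:.1f} GiB")  # e.g. 31.3 GiB on a 32 GB host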

vllm/platforms/cuda.py

@@ -0,0 +1,150 @@
"""Code inside this file can safely assume cuda platform, e.g. importing
pynvml. However, it should not initialize cuda context.
"""
import os
from functools import lru_cache, wraps
from typing import Callable, List, Tuple, TypeVar
import pynvml
import torch
from typing_extensions import ParamSpec
from vllm.logger import init_logger
from .interface import DeviceCapability, Platform, PlatformEnum
logger = init_logger(__name__)
_P = ParamSpec("_P")
_R = TypeVar("_R")
if pynvml.__file__.endswith("__init__.py"):
logger.warning(
"You are using a deprecated `pynvml` package. Please install"
" `nvidia-ml-py` instead, and make sure to uninstall `pynvml`."
" When both of them are installed, `pynvml` will take precedence"
" and cause errors. See https://pypi.org/project/pynvml "
"for more information.")
# pytorch 2.5 uses cudnn sdpa by default, which can crash some models
# see https://github.com/huggingface/diffusers/issues/9704 for details
torch.backends.cuda.enable_cudnn_sdp(False)
# NVML utils
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
# all the related functions work on real physical device ids.
# the major benefit of using NVML is that it will not initialize CUDA
def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
@wraps(fn)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
pynvml.nvmlInit()
try:
return fn(*args, **kwargs)
finally:
pynvml.nvmlShutdown()
return wrapper
@lru_cache(maxsize=8)
@with_nvml_context
def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]:
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
@lru_cache(maxsize=8)
@with_nvml_context
def get_physical_device_name(device_id: int = 0) -> str:
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
return pynvml.nvmlDeviceGetName(handle)
@lru_cache(maxsize=8)
@with_nvml_context
def get_physical_device_total_memory(device_id: int = 0) -> int:
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
@with_nvml_context
def warn_if_different_devices():
device_ids: int = pynvml.nvmlDeviceGetCount()
if device_ids > 1:
device_names = [get_physical_device_name(i) for i in range(device_ids)]
if len(set(device_names)) > 1 and os.environ.get(
"CUDA_DEVICE_ORDER") != "PCI_BUS_ID":
logger.warning(
"Detected different devices in the system: \n%s\nPlease"
" make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
"avoid unexpected behavior.", "\n".join(device_names))
try:
from sphinx.ext.autodoc.mock import _MockModule
if not isinstance(pynvml, _MockModule):
warn_if_different_devices()
except ModuleNotFoundError:
warn_if_different_devices()
def device_id_to_physical_device_id(device_id: int) -> int:
if "CUDA_VISIBLE_DEVICES" in os.environ:
device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
if device_ids == [""]:
raise RuntimeError("CUDA_VISIBLE_DEVICES is set to empty string,"
" which means GPU support is disabled.")
physical_device_id = device_ids[device_id]
return int(physical_device_id)
else:
return device_id
class CudaPlatform(Platform):
_enum = PlatformEnum.CUDA
@classmethod
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
physical_device_id = device_id_to_physical_device_id(device_id)
major, minor = get_physical_device_capability(physical_device_id)
return DeviceCapability(major=major, minor=minor)
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
physical_device_id = device_id_to_physical_device_id(device_id)
return get_physical_device_name(physical_device_id)
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
physical_device_id = device_id_to_physical_device_id(device_id)
return get_physical_device_total_memory(physical_device_id)
@classmethod
@with_nvml_context
def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
"""
Query whether the given set of GPUs is fully connected by NVLink (1 hop).
"""
handles = [
pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids
]
for i, handle in enumerate(handles):
for j, peer_handle in enumerate(handles):
if i < j:
try:
p2p_status = pynvml.nvmlDeviceGetP2PStatus(
handle, peer_handle,
pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
if p2p_status != pynvml.NVML_P2P_STATUS_OK:
return False
except pynvml.NVMLError:
logger.exception(
"NVLink detection failed. This is normal if your"
" machine has no NVLink equipped.")
return False
return True
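To make the CUDA_VISIBLE_DEVICES remapping concrete, here is a self-contained sketch of the same logic as device_id_to_physical_device_id above, using a hypothetical environment value (no NVML required):

import os

def logical_to_physical(device_id: int) -> int:
    # Logical ids index into the CUDA_VISIBLE_DEVICES list; without the
    # variable, logical and physical ids coincide.
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible is None:
        return device_id
    ids = visible.split(",")
    if ids == [""]:
        raise RuntimeError("CUDA_VISIBLE_DEVICES is set to empty string")
    return int(ids[device_id])

os.environ["CUDA_VISIBLE_DEVICES"] = "3,1"  # hypothetical mapping
assert logical_to_physical(0) == 3  # logical GPU 0 is physical GPU 3
assert logical_to_physical(1) == 1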

vllm/platforms/hpu.py

@@ -0,0 +1,11 @@
import torch
from .interface import Platform, PlatformEnum
class HpuPlatform(Platform):
_enum = PlatformEnum.HPU
@staticmethod
def inference_mode():
return torch.no_grad()

vllm/platforms/interface.py

@@ -0,0 +1,141 @@
import enum
import random
from typing import NamedTuple, Optional, Tuple, Union
import numpy as np
import torch
class PlatformEnum(enum.Enum):
CUDA = enum.auto()
ROCM = enum.auto()
TPU = enum.auto()
HPU = enum.auto()
XPU = enum.auto()
CPU = enum.auto()
NEURON = enum.auto()
OPENVINO = enum.auto()
MLU = enum.auto()
UNSPECIFIED = enum.auto()
class DeviceCapability(NamedTuple):
major: int
minor: int
def as_version_str(self) -> str:
return f"{self.major}.{self.minor}"
def to_int(self) -> int:
"""
Express device capability as an integer ``<major><minor>``.
It is assumed that the minor version is always a single digit.
"""
assert 0 <= self.minor < 10
return self.major * 10 + self.minor
class Platform:
_enum: PlatformEnum
def is_cuda(self) -> bool:
return self._enum == PlatformEnum.CUDA
def is_rocm(self) -> bool:
return self._enum == PlatformEnum.ROCM
def is_tpu(self) -> bool:
return self._enum == PlatformEnum.TPU
def is_hpu(self) -> bool:
return self._enum == PlatformEnum.HPU
def is_xpu(self) -> bool:
return self._enum == PlatformEnum.XPU
def is_cpu(self) -> bool:
return self._enum == PlatformEnum.CPU
def is_neuron(self) -> bool:
return self._enum == PlatformEnum.NEURON
def is_openvino(self) -> bool:
return self._enum == PlatformEnum.OPENVINO
def is_mlu(self) -> bool:
return self._enum == PlatformEnum.MLU
def is_cuda_alike(self) -> bool:
"""Stateless version of :func:`torch.cuda.is_available`."""
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
@classmethod
def get_device_capability(
cls,
device_id: int = 0,
) -> Optional[DeviceCapability]:
"""Stateless version of :func:`torch.cuda.get_device_capability`."""
return None
@classmethod
def has_device_capability(
cls,
capability: Union[Tuple[int, int], int],
device_id: int = 0,
) -> bool:
"""
Test whether this platform is compatible with a device capability.
The ``capability`` argument can either be:
- A tuple ``(major, minor)``.
- An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
"""
current_capability = cls.get_device_capability(device_id=device_id)
if current_capability is None:
return False
if isinstance(capability, tuple):
return current_capability >= capability
return current_capability.to_int() >= capability
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
"""Get the name of a device."""
raise NotImplementedError
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
"""Get the total memory of a device in bytes."""
raise NotImplementedError
@classmethod
def inference_mode(cls):
"""A device-specific wrapper of `torch.inference_mode`.
This wrapper is recommended because some hardware backends such as TPU
do not support `torch.inference_mode`. In such a case, they will fall
back to `torch.no_grad` by overriding this method.
"""
return torch.inference_mode(mode=True)
@classmethod
def seed_everything(cls, seed: int) -> None:
"""
Set the seed of each random module.
`torch.manual_seed` will set seed on all devices.
Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
"""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
class UnspecifiedPlatform(Platform):
_enum = PlatformEnum.UNSPECIFIED
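A short sketch of the capability arithmetic defined above (the DeviceCapability shape is copied from this file; the values are illustrative):

from typing import NamedTuple

class DeviceCapability(NamedTuple):
    major: int
    minor: int

    def to_int(self) -> int:
        assert 0 <= self.minor < 10
        return self.major * 10 + self.minor

cap = DeviceCapability(major=8, minor=6)  # e.g. an sm_86 GPU
assert cap.to_int() == 86
assert cap >= (8, 0)        # tuple form: lexicographic NamedTuple comparison
assert cap.to_int() >= 80   # integer form accepted by has_device_capability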

vllm/platforms/mlu.py

@@ -0,0 +1,25 @@
from functools import lru_cache
import torch
from .interface import DeviceCapability, Platform, PlatformEnum
class MluPlatform(Platform):
_enum = PlatformEnum.MLU
@classmethod
@lru_cache(maxsize=8)
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
major, minor = torch.mlu.get_device_capability(device_id)
return DeviceCapability(major=major, minor=minor)
@classmethod
@lru_cache(maxsize=8)
def get_device_name(cls, device_id: int = 0) -> str:
return torch.mlu.get_device_name(device_id)
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.mlu.get_device_properties(device_id)
return device_props.total_memory

vllm/platforms/neuron.py

@@ -0,0 +1,9 @@
from .interface import Platform, PlatformEnum
class NeuronPlatform(Platform):
_enum = PlatformEnum.NEURON
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
return "neuron"

vllm/platforms/openvino.py

@@ -0,0 +1,33 @@
import torch
import vllm.envs as envs
from vllm.logger import init_logger
from .interface import Platform, PlatformEnum
logger = init_logger(__name__)
class OpenVinoPlatform(Platform):
_enum = PlatformEnum.OPENVINO
    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return "openvino"
    @classmethod
    def inference_mode(cls):
        return torch.inference_mode(mode=True)
    @classmethod
    def is_openvino_cpu(cls) -> bool:
        return "CPU" in envs.VLLM_OPENVINO_DEVICE
    @classmethod
    def is_openvino_gpu(cls) -> bool:
        return "GPU" in envs.VLLM_OPENVINO_DEVICE
    @classmethod
    def is_pin_memory_available(cls) -> bool:
        logger.warning("Pin memory is not supported on OpenVINO.")
        return False

vllm/platforms/rocm.py

@@ -0,0 +1,36 @@
import os
from functools import lru_cache
import torch
from vllm.logger import init_logger
from .interface import DeviceCapability, Platform, PlatformEnum
logger = init_logger(__name__)
if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
logger.warning("`fork` method is not supported by ROCm. "
"VLLM_WORKER_MULTIPROC_METHOD is overridden to"
" `spawn` instead.")
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
class RocmPlatform(Platform):
_enum = PlatformEnum.ROCM
@classmethod
@lru_cache(maxsize=8)
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
major, minor = torch.cuda.get_device_capability(device_id)
return DeviceCapability(major=major, minor=minor)
@classmethod
@lru_cache(maxsize=8)
def get_device_name(cls, device_id: int = 0) -> str:
return torch.cuda.get_device_name(device_id)
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.cuda.get_device_properties(device_id)
return device_props.total_memory

vllm/platforms/tpu.py

@@ -0,0 +1,33 @@
import os
import torch
import vllm.envs as envs
from vllm.compilation.levels import CompilationLevel
from vllm.plugins import set_torch_compile_backend
from .interface import Platform, PlatformEnum
if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ:
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE)
assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE,\
"TPU does not support Inductor."
set_torch_compile_backend("openxla")
class TpuPlatform(Platform):
_enum = PlatformEnum.TPU
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
raise NotImplementedError
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
raise NotImplementedError
@classmethod
def inference_mode(cls):
return torch.no_grad()
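The no_grad fallback above follows the override pattern documented on Platform.inference_mode in interface.py. A minimal sketch of the same pattern for a hypothetical backend (the class and its enum choice are for illustration only):

import torch
from vllm.platforms.interface import Platform, PlatformEnum

class MyBackendPlatform(Platform):  # hypothetical, for illustration
    _enum = PlatformEnum.UNSPECIFIED  # a real backend adds its own member

    @classmethod
    def inference_mode(cls):
        # Fall back to no_grad where torch.inference_mode is unsupported.
        return torch.no_grad()

with MyBackendPlatform.inference_mode():
    pass  # run forward passes here with gradients disabled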

vllm/platforms/xpu.py

@@ -0,0 +1,26 @@
import torch
from .interface import DeviceCapability, Platform, PlatformEnum
class XPUPlatform(Platform):
_enum = PlatformEnum.XPU
@staticmethod
def get_device_capability(device_id: int = 0) -> DeviceCapability:
major, minor, *_ = torch.xpu.get_device_capability(
device_id)['version'].split('.')
return DeviceCapability(major=int(major), minor=int(minor))
@staticmethod
def get_device_name(device_id: int = 0) -> str:
return torch.xpu.get_device_name(device_id)
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.xpu.get_device_properties(device_id)
return device_props.total_memory
@staticmethod
def inference_mode():
return torch.no_grad()
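For clarity on the parse in get_device_capability above: torch.xpu reports the capability as a dict whose 'version' field is a dotted string, which the split turns into integers. A tiny illustration with a made-up version string:

major, minor, *_ = "1.3".split(".")
assert (int(major), int(minor)) == (1, 3)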