From 9a405274e287ce370a7788c6c70c4d40b06b688b Mon Sep 17 00:00:00 2001 From: JieXin Liang Date: Thu, 15 May 2025 15:51:30 +0800 Subject: [PATCH] [misc] remove redundant platform codes (#6298) --- python/sglang/srt/platforms/interface.py | 371 ----------------------- python/sglang/srt/utils.py | 4 - 2 files changed, 375 deletions(-) delete mode 100644 python/sglang/srt/platforms/interface.py diff --git a/python/sglang/srt/platforms/interface.py b/python/sglang/srt/platforms/interface.py deleted file mode 100644 index eb8dbd469..000000000 --- a/python/sglang/srt/platforms/interface.py +++ /dev/null @@ -1,371 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Adapted from -# https://github.com/vllm-project/vllm/blob/v0.8.2/vllm/platforms/interface.py - -import enum -import platform -import random -from platform import uname -from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Tuple, Union - -if TYPE_CHECKING: - from sglang.srt.server_args import ServerArgs - from sglang.srt.configs.model_config import ModelConfig - -import logging - -import numpy as np -import torch - -logger = logging.getLogger(__name__) - - -def in_wsl() -> bool: - # Reference: https://github.com/microsoft/WSL/issues/4071 - return "microsoft" in " ".join(uname()).lower() - - -class PlatformEnum(enum.Enum): - CUDA = enum.auto() - ROCM = enum.auto() - HPU = enum.auto() - XPU = enum.auto() - CPU = enum.auto() - OOT = enum.auto() - UNSPECIFIED = enum.auto() - - -class CpuArchEnum(enum.Enum): - X86 = enum.auto() - ARM = enum.auto() - POWERPC = enum.auto() - OTHER = enum.auto() - UNKNOWN = enum.auto() - - -class DeviceCapability(NamedTuple): - major: int - minor: int - - def as_version_str(self) -> str: - return f"{self.major}.{self.minor}" - - def to_int(self) -> int: - """ - Express device capability as an integer ````. - - It is assumed that the minor version is always a single digit. - """ - assert 0 <= self.minor < 10 - return self.major * 10 + self.minor - - -class Platform: - _enum: PlatformEnum - - # Real device name of current platform. - device_name: str - - # For specifying torch device for cuda alike platform's capability. - device_type: str - - # The torch.distributed backend on current platform - torch_distributed_backend: str - - # The torch.compile backend for compiling simple and - # standalone functions. The default value is "inductor" to keep - # the same behavior as PyTorch. - torch_compile_backend: str = "inductor" - - supported_quantization: list[str] = [] - - supported_speculative_algorithm: list[str] = [] - - # Use first element as default dtype - supported_dtype: list[str] = [] - - # Use first element as default backend - supported_attntion_backend: list[str] = [] - - # Use first element as default backend - supported_sampling_backend: list[str] = [] - - # Use first element as default backend - supported_lora_backend: list[str] = [] - - def is_cuda(self) -> bool: - return self._enum == PlatformEnum.CUDA - - def is_rocm(self) -> bool: - return self._enum == PlatformEnum.ROCM - - def is_hpu(self) -> bool: - return self._enum == PlatformEnum.HPU - - def is_xpu(self) -> bool: - return self._enum == PlatformEnum.XPU - - def is_cpu(self) -> bool: - return self._enum == PlatformEnum.CPU - - def is_out_of_tree(self) -> bool: - return self._enum == PlatformEnum.OOT - - def is_cuda_alike(self) -> bool: - """Stateless version of :func:`torch.cuda.is_available`.""" - return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) - - @classmethod - def get_device_capability( - cls, - device_id: int = 0, - ) -> Optional[DeviceCapability]: - """Stateless version of :func:`torch.cuda.get_device_capability`.""" - return None - - @classmethod - def has_device_capability( - cls, - capability: Union[Tuple[int, int], int], - device_id: int = 0, - ) -> bool: - """ - Test whether this platform is compatible with a device capability. - - The ``capability`` argument can either be: - - - A tuple ``(major, minor)``. - - An integer ````. (See :meth:`DeviceCapability.to_int`) - """ - current_capability = cls.get_device_capability(device_id=device_id) - if current_capability is None: - return False - - if isinstance(capability, tuple): - return current_capability >= capability - - return current_capability.to_int() >= capability - - @classmethod - def get_device_module(cls) -> Any: - """Get `torch.device_module` like `torch.cuda` of current platform.""" - raise NotImplementedError - - @classmethod - def get_device_sku(cls, device_id: int = 0) -> str: - """Get the SKU name of a device.""" - raise NotImplementedError - - @classmethod - def get_device_uuid(cls, device_id: int = 0) -> str: - """Get the uuid of a device, e.g. the PCI bus ID.""" - raise NotImplementedError - - @classmethod - def get_device_core_count(cls, device_id: int = 0) -> str: - """Get the core count of a device, e.g. SMs of CUDA, CUs of ROCM.""" - raise NotImplementedError - - @classmethod - def get_device_count(cls) -> int: - """Get device count on current platform""" - raise NotImplementedError - - @classmethod - def get_device_total_memory(cls, device_id: int = 0, distributed=False) -> float: - """ - Get total memory for device_type:device_id device in gigabytes. - """ - raise NotImplementedError - - @classmethod - def get_device_available_memory( - cls, device_id: int = 0, distributed=False, empty_cache=True - ) -> float: - """ - Get available memory for device_type:device_id device in gigabytes. - When distributed is True, the available memory is the minimum available memory of all GPUs. - """ - raise NotImplementedError - - @classmethod - def supports_overlap_scheduler(cls) -> bool: - """ - Check if the current platform supports overlap scheduler - """ - raise NotImplementedError - - @classmethod - def seed_everything(cls, seed: Optional[int] = None) -> None: - """ - Set the seed of each random module. - `torch.manual_seed` will set seed on all devices. - - Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20 - """ - if seed is not None: - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - - @classmethod - def check_and_update_server_args(cls, server_args: ServerArgs) -> None: - """ - Check and update the server arguments for the current platform. - - It can raise an exception if the configuration is not compatible with - the current platform, or it can update the configuration to make it - compatible with the current platform. - - The config is passed by reference, so it can be modified in place. - """ - pass - - @classmethod - def check_and_update_model_dtype(cls, model_config: ModelConfig, dtype: str) -> str: - """ - Check and update the model's dtype for the current platform. - """ - if cls.supported_dtype and dtype not in cls.supported_dtype: - logger.warning( - f"dtype {dtype} is currently not supported in " - f"{cls.device_name}. use {cls.supported_dtype[0]} instead" - ) - return cls.supported_dtype[0] - return dtype - - @classmethod - def check_and_update_attntion_backend( - cls, model_config: ModelConfig, backend: str - ) -> str: - """ - Check and update the attntion backend for the current platform. - """ - raise NotImplementedError - - @classmethod - def check_and_update_sampling_backend(cls, backend: str) -> str: - """ - Check and update the sampling backend for the current platform. - """ - raise NotImplementedError - - @classmethod - def check_and_update_lora_backend(cls, backend: str) -> str: - """ - Check and update the lora backend for the current platform. - """ - raise NotImplementedError - - @classmethod - def verify_model_arch(cls, model_arch: str) -> None: - """ - Verify whether the current platform supports the specified model - architecture. - - - This will raise an Error or Warning based on the model support on - the current platform. - - By default all models are considered supported. - """ - pass - - @classmethod - def verify_quantization(cls, quant: str) -> None: - """ - Verify whether the quantization is supported by the current platform. - """ - if cls.supported_quantization and quant not in cls.supported_quantization: - raise ValueError( - f"{quant} quantization is currently not supported in " - f"{cls.device_name}." - ) - - @classmethod - def verify_speculative_algorithm(cls, algo: str) -> None: - """ - Verify whether the speculative algorithm is supported by the current platform. - """ - if ( - cls.supported_speculative_algorithm - and algo not in cls.supported_speculative_algorithm - ): - raise ValueError( - f"speculative algorithm {algo} is currently not supported in " - f"{cls.device_name}." - ) - - @classmethod - def get_cpu_architecture(cls) -> CpuArchEnum: - """ - Determine the CPU architecture of the current system. - Returns CpuArchEnum indicating the architecture type. - """ - machine = platform.machine().lower() - - if machine in ("x86_64", "amd64", "i386", "i686"): - return CpuArchEnum.X86 - elif machine.startswith("arm") or machine.startswith("aarch"): - return CpuArchEnum.ARM - elif machine.startswith("ppc"): - return CpuArchEnum.POWERPC - - return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN - - @classmethod - def is_pin_memory_available(cls) -> bool: - """Checks whether pin memory is available on the current platform.""" - if in_wsl(): - # Pinning memory in WSL is not supported. - # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications - logger.warning( - "Using 'pin_memory=False' as WSL is detected. " - "This may slow down the performance." - ) - return False - return True - - @classmethod - def get_device_communicator_cls(cls) -> str: - """ - Get device specific communicator class for distributed communication. - """ - raise NotImplementedError - - @classmethod - def supports_fp8(cls) -> bool: - return False - - @classmethod - def fp8_dtype(cls) -> torch.dtype: - """ - Returns the preferred FP8 type on the current platform. - """ - return torch.float8_e4m3fn - - @classmethod - def fp8_min_max(cls) -> Tuple[float, float]: - """ - Returns the preferred FP8 max value on the current platform. - """ - fp8_max = torch.finfo(cls.fp8_dtype()).max - return (-fp8_max, fp8_max) - - @classmethod - def is_triton_avaliable(cls) -> bool: - raise NotImplementedError - - @classmethod - def init_environments(cls) -> None: - """ - Init environments on current platform. - - - Init platform specific env vars. - - Init platform specific patches. - """ - pass - - -class UnspecifiedPlatform(Platform): - _enum = PlatformEnum.UNSPECIFIED - device_type = "" diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 85cb807c5..ffc453d88 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -125,10 +125,6 @@ builtins.FP8_E4M3_MAX = FP8_E4M3_MAX builtins.FP8_E4M3_MIN = FP8_E4M3_MIN -def is_rocm() -> bool: - return torch.cuda.is_available() and torch.version.hip - - def is_cuda(): return torch.cuda.is_available() and torch.version.cuda