Initial commit for vLLM-Kunlun Plugin

2025-12-10 12:05:39 +08:00
commit c728e52505
131 changed files with 28816 additions and 0 deletions
--- a/vllm_kunlun/platforms/init.py
+++ b/vllm_kunlun/platforms/init.py
@@ -0,0 +1,5 @@
+from .kunlun import KunlunPlatform
+
+# Module-level KunlunPlatform instance, created once when this module is imported.
+current_platform = KunlunPlatform()
+
+# Names exported by a star-import of this module.
+__all__ = ["current_platform", "KunlunPlatform"]
--- a/vllm_kunlun/platforms/envs.py
+++ b/vllm_kunlun/platforms/envs.py
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from typing import TYPE_CHECKING, Any, Callable, Optional
+
+if TYPE_CHECKING:
+    # Static-analysis-only declarations of the attributes that the module-level
+    # ``__getattr__`` serves lazily from ``xvllm_environment_variables``.
+    # Each default mirrors the fallback used by the matching dictionary entry.
+    # (No trailing commas: ``x: bool = (False,)`` would declare a 1-tuple.)
+    VLLM_MULTI_LOGPATH: str = "./logs"
+    ENABLE_VLLM_MULTI_LOG: bool = False
+    ENABLE_VLLM_INFER_HOOK: bool = False
+    ENABLE_VLLM_OPS_HOOK: bool = False
+    ENABLE_VLLM_MODULE_HOOK: bool = False
+    ENABLE_VLLM_MOE_FC_SORTED: bool = False
+    ENABLE_CUSTOM_DPSK_SCALING_ROPE: bool = False
+    ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE: bool = False
+
+
+def maybe_convert_int(value: Optional[str]) -> Optional[int]:
+    """
+    Parse an optional string as an integer.
+
+    Args:
+        value (Optional[str]): The text to parse, or None.
+
+    Returns:
+        Optional[int]: None when ``value`` is None; otherwise ``int(value)``.
+
+    Raises:
+        ValueError: Propagated from ``int`` when ``value`` is not a valid integer literal.
+    """
+    return None if value is None else int(value)
+
+
+# The begin-* and end-* markers below are used by the documentation generator
+# to extract the environment variables that are in use.
+
+# begin-env-vars-definition
+
+def _env_flag(name: str) -> bool:
+    """Return True when env var ``name`` is "true" or "1" (case-insensitive); unset counts as False."""
+    raw_value = os.environ.get(name, "False")
+    return raw_value.lower() in ("true", "1")
+
+
+# Maps each supported variable name to a zero-argument getter, so the
+# environment is read when the getter is called, not when this dict is built.
+xvllm_environment_variables: dict[str, Callable[[], Any]] = {
+    # directory for the redirected-output logs (defaults to the relative path "./logs")
+    "VLLM_MULTI_LOGPATH": lambda: os.environ.get("VLLM_MULTI_LOGPATH", "./logs"),
+    # turn on / off multi-log of multi nodes & multi cards
+    "ENABLE_VLLM_MULTI_LOG": lambda: _env_flag("ENABLE_VLLM_MULTI_LOG"),
+    # turn on / off XVLLM infer stage log ability
+    "ENABLE_VLLM_INFER_HOOK": lambda: _env_flag("ENABLE_VLLM_INFER_HOOK"),
+    # turn on / off XVLLM infer_ops log ability
+    "ENABLE_VLLM_OPS_HOOK": lambda: _env_flag("ENABLE_VLLM_OPS_HOOK"),
+    # NOTE(review): presumably turns on / off XVLLM module-level hooks — confirm
+    "ENABLE_VLLM_MODULE_HOOK": lambda: _env_flag("ENABLE_VLLM_MODULE_HOOK"),
+    # fuse sorted op with fused_moe kernel
+    "ENABLE_VLLM_MOE_FC_SORTED": lambda: _env_flag("ENABLE_VLLM_MOE_FC_SORTED"),
+    # enable custom dpsk scaling rope
+    "ENABLE_CUSTOM_DPSK_SCALING_ROPE": lambda: _env_flag(
+        "ENABLE_CUSTOM_DPSK_SCALING_ROPE"
+    ),
+    # fuse qkv split & qk norm & qk rope
+    # only works for qwen3 dense and qwen3 moe models
+    "ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE": lambda: _env_flag(
+        "ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE"
+    ),
+}
+
+# end-env-vars-definition
+
+
+def __getattr__(name: str):
+    """
+    Module-level attribute hook that resolves environment variables lazily.
+
+    Python invokes this function for attribute names that normal module lookup did not find.
+
+    Args:
+        name (str): The name of the attribute being looked up.
+
+    Raises:
+        AttributeError: If ``name`` is not a key of ``xvllm_environment_variables``.
+
+    Returns:
+        Any: The result of calling the getter registered for ``name``.
+    """
+    getter = xvllm_environment_variables.get(name)
+    if getter is None:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    # The getter runs on every access, so the value is re-read each time.
+    return getter()
+
+
+def __dir__():
+    """
+    List the environment variable names exposed by this module.
+
+    Returns:
+        List[str]: The keys of the `xvllm_environment_variables` dictionary.
+    """
+    return [*xvllm_environment_variables]
+
+
+def is_set(name: str):
+    """
+    Check whether a known environment variable is explicitly set.
+
+    Args:
+        name (str): A key of ``xvllm_environment_variables``.
+
+    Raises:
+        AttributeError: If ``name`` is not a key of ``xvllm_environment_variables``.
+
+    Returns:
+        bool: True if ``name`` is present in ``os.environ``, regardless of its value.
+    """
+    if name not in xvllm_environment_variables:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    return name in os.environ
--- a/vllm_kunlun/platforms/kunlun.py
+++ b/vllm_kunlun/platforms/kunlun.py
@@ -0,0 +1,289 @@
+"""kunlun"""
+import psutil
+import torch
+
+from vllm.platforms.interface import DeviceCapability, Platform, PlatformEnum, _Backend
+from typing import Optional, Union
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+
+# Module-level logger; KunlunPlatform.check_and_update_config reports its overrides through it.
+logger = init_logger(__name__)
+
+class KunlunPlatform(Platform):
+    """vLLM platform implementation for Kunlun devices.
+
+    The class reports ``PlatformEnum.CUDA`` as its enum and talks to the device
+    through the ``torch.cuda`` API, while the attention backend, device
+    communicator and LoRA wrapper resolve to classes under ``vllm_kunlun``.
+    """
+    # The is_* predicates below are all evaluated against this value.
+    _enum = PlatformEnum.CUDA
+    dist_backend: str = "nccl"
+    ray_device_key: str = "GPU"
+    device_name: str = "xpu"
+
+    @property
+    def device_type(self):
+        """Returns the device type, which is fixed as 'cuda'.
+        """
+        return "cuda"
+
+    def is_kunlun(self) -> bool:
+        """Return True while ``_enum`` is ``PlatformEnum.CUDA`` (the value this class sets)."""
+        return self._enum == PlatformEnum.CUDA
+
+    def is_cuda(self) -> bool:
+        """Always return False, even though ``_enum`` is ``PlatformEnum.CUDA``."""
+        return False
+
+    def is_rocm(self) -> bool:
+        """Return True if ``_enum`` is ``PlatformEnum.ROCM``."""
+        return self._enum == PlatformEnum.ROCM
+
+    def is_tpu(self) -> bool:
+        """Return True if ``_enum`` is ``PlatformEnum.TPU``."""
+        return self._enum == PlatformEnum.TPU
+
+    def is_hpu(self) -> bool:
+        """Return True if ``_enum`` is ``PlatformEnum.HPU``."""
+        return self._enum == PlatformEnum.HPU
+
+    def is_xpu(self) -> bool:
+        """Return True if ``_enum`` is ``PlatformEnum.XPU``."""
+        return self._enum == PlatformEnum.XPU
+
+    def is_cpu(self) -> bool:
+        """Return True if ``_enum`` is ``PlatformEnum.CPU``."""
+        return self._enum == PlatformEnum.CPU
+
+    def is_neuron(self) -> bool:
+        """Return True if ``_enum`` is ``PlatformEnum.NEURON``."""
+        return self._enum == PlatformEnum.NEURON
+
+    def is_out_of_tree(self) -> bool:
+        """Return True if ``_enum`` is ``PlatformEnum.OOT``."""
+        return self._enum == PlatformEnum.OOT
+
+    def is_cuda_alike(self) -> bool:
+        """Return True if ``_enum`` is ``PlatformEnum.CUDA`` or ``PlatformEnum.ROCM``."""
+        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
+
+    def is_sleep_mode_available(self) -> bool:
+        """Return True if ``_enum`` is ``PlatformEnum.CUDA``."""
+        return self._enum == PlatformEnum.CUDA
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        """Returns the device name, which is fixed as "kunlun".
+
+        Args:
+            device_id (int, optional): The device ID. Ignored in this method. Defaults to 0.
+
+        Returns:
+            str: The device name, which is fixed as "kunlun".
+        """
+        return "kunlun"
+
+    @classmethod
+    def get_piecewise_backend_cls(cls) -> str:
+        """Return the dotted path of the piecewise compilation backend class."""
+        return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
+
+    @classmethod
+    def get_static_graph_wrapper_cls(cls) -> str:
+        """Return the dotted path of the static graph wrapper class."""
+        return "vllm.compilation.cuda_graph.CUDAGraphWrapper"  # noqa
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        """Returns the ``total`` field of ``psutil.virtual_memory()``, in bytes.
+
+        NOTE(review): psutil reports host memory here, not a per-device figure,
+        and ``device_id`` is not used or validated — confirm this is intended.
+
+        Args:
+            device_id (int, optional): The device ID. Ignored in this method. Defaults to 0.
+
+        Returns:
+            int: The total memory size in bytes (B) as reported by psutil.
+        """
+        return psutil.virtual_memory().total
+
+    @classmethod
+    def inference_mode(cls):
+        """Returns a context manager that disables gradient computation (``torch.no_grad()``).
+        """
+        return torch.no_grad()
+
+    @classmethod
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        """Returns the capability reported by ``torch.cuda.get_device_capability()``.
+
+        Args:
+            device_id (int, optional): Not forwarded; torch is queried without a device argument.
+                Defaults to 0.
+
+        Returns:
+            DeviceCapability: The (major, minor) pair returned by torch.
+        """
+        major, minor = torch.cuda.get_device_capability()
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        """Updates ``vllm_config`` in place with the defaults this platform needs.
+
+        The steps run in this order:
+
+        1. If ``parallel_config.worker_cls`` is "auto", the worker class is chosen from
+           ``envs.VLLM_USE_V1`` and from whether a speculative config is present.
+        2. If the block size is not set in the cache configuration, it is set to 16.
+        3. If using MLA and `VLLM_ATTENTION_BACKEND` is not set or is set to "FLASHMLA",
+           and FlashMLA reports itself as supported, the cache block size is set to 64.
+        4. If running the DeepEP high throughput backend with data parallelism greater
+           than 1 and CUDA graphs enabled, eager mode is forced and inductor is disabled.
+        5. If CUDA graphs are still enabled on V1, all custom ops are enabled and
+           fusion and inductor are disabled.
+
+        Args:
+            vllm_config (VllmConfig): VLLM configuration object.
+
+        Returns:
+            None: No return value.
+        """
+        parallel_config = vllm_config.parallel_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        compilation_config = vllm_config.compilation_config
+
+        if parallel_config.worker_cls == "auto":
+            use_v1 = envs.VLLM_USE_V1
+            # Reported through the logger rather than print() so it follows
+            # the configured log level and handlers.
+            logger.info("envs.VLLM_USE_V1 = %s", use_v1)
+            if use_v1:
+                # V1 uses the same worker with or without speculative decoding.
+                parallel_config.worker_cls = \
+                    "vllm.v1.worker.gpu_worker.Worker"
+            elif vllm_config.speculative_config:
+                parallel_config.worker_cls = \
+                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+                parallel_config.sd_worker_cls = \
+                    "vllm.worker.worker.Worker"
+            else:
+                parallel_config.worker_cls = "vllm.worker.worker.Worker"
+
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
+        # TODO(lucas): handle this more gracefully
+        # Note: model_config may be None during testing
+        if model_config is not None and model_config.use_mla:
+            # if `VLLM_ATTENTION_BACKEND` is not set and we are using MLA, then
+            # we default to FlashMLA backend, so we need to force the blocksize
+            # here
+            use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None
+                            or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
+            from vllm.attention.ops.flashmla import is_flashmla_supported
+            # cache_config is guarded here the same way as for the default
+            # block size above, so a missing cache config cannot crash this path.
+            if (use_flashmla and is_flashmla_supported()[0]
+                    and cache_config is not None
+                    and cache_config.block_size != 64):
+                cache_config.block_size = 64
+                logger.info(
+                    "Forcing kv cache block size to 64 for FlashMLA backend.")
+
+        if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
+                and parallel_config.data_parallel_size > 1
+                and compilation_config.use_cudagraph):
+            logger.info(
+                "Data Parallel: Forcing enforce eager to be True since DP "
+                "with DeepEP high-throughput kernels are not CUDA Graph "
+                "compatible. The DeepEP low-latency kernels are CUDA Graph "
+                "compatible. Set the all_to_all backend to deepep_low_latency "
+                "to use those kernels instead.")
+            compilation_config.use_cudagraph = False
+            # model_config may be None during testing (see note above).
+            if model_config is not None:
+                model_config.enforce_eager = True
+            # TODO (varun): Turning this ON gives incorrect results for the
+            # Deepseek-V2-lite model.
+            compilation_config.use_inductor = False
+        if compilation_config.use_cudagraph and envs.VLLM_USE_V1:
+            compilation_config.custom_ops = ["all"]
+            compilation_config.pass_config.enable_fusion = False
+            compilation_config.use_inductor = False
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
+                             kv_cache_dtype, block_size, use_v1, use_mla,use_sink):
+        """
+            Returns the dotted path of the attention backend class.
+
+            Only ``use_v1`` and ``use_mla`` influence the result; the remaining
+            arguments are accepted but not inspected.
+
+        Args:
+            selected_backend: Requested backend. Not inspected.
+            head_size (int): Size of the attention heads. Not inspected.
+            dtype (torch.dtype): Data type of the input tensor. Not inspected.
+            kv_cache_dtype: Data type of the key-value cache. Not inspected.
+            block_size (int): Block size used in the attention computation. Not inspected.
+            use_v1 (bool): Whether to use the v1 backend. Takes precedence over ``use_mla``.
+            use_mla (bool): Whether to use the MLA backend. Only consulted when ``use_v1`` is false.
+            use_sink: Not inspected.
+
+        Returns:
+            str: Dotted path of the attention backend class.
+        """
+        if use_v1:
+            return "vllm_kunlun.v1.attention.backends.kunlun_attn.KunlunAttentionBackend"
+        elif not use_mla:
+            return "vllm_kunlun.ops.attention.backends.kunlun_attn.KunlunAttentionBackend"
+        else:
+            # NOTE(review): this path has no ``ops`` segment, unlike the
+            # non-MLA path above — confirm the module location.
+            return "vllm_kunlun.attention.backends.kunlun_mla.KunlunMLAAttentionBackend"
+
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        """Resets the peak memory statistics of the device, then returns the max allocated value.
+
+        Args:
+            device (Optional[torch.types.Device], optional): Device forwarded to both
+                ``torch.cuda`` calls. Defaults to None.
+
+        Returns:
+            float: The value of ``torch.cuda.max_memory_allocated(device)`` read right after the reset.
+        """
+        torch.cuda.reset_peak_memory_stats(device)
+        return torch.cuda.max_memory_allocated(device)
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        """Checks if asynchronous output is supported.
+        This implementation always reports that it is not.
+
+        Args:
+            enforce_eager (Optional[bool]): Whether eager execution is enforced. Not inspected.
+
+        Returns:
+            bool: Always False.
+        """
+        # Assume Kunlun does not support asynchronous output
+        return False
+
+    @classmethod
+    def supports_v1(cls, model_config: "ModelConfig") -> bool:
+        """
+            Check if the model config is supported by this class in v1.
+
+        Args:
+            model_config (ModelConfig): Model configuration to be checked. Not inspected.
+
+        Returns:
+            bool: Always True for this class.
+        """
+        return True
+
+    @classmethod
+    def set_device(cls, device: torch.device) -> None:
+        """
+        Set the device for the current platform via ``torch.cuda.set_device``.
+        """
+        torch.cuda.set_device(device)
+
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        """Return the dotted path of the device communicator class."""
+        return "vllm_kunlun.distributed.kunlun_communicator.KunlunCommunicator"
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        """Return the dotted path of the Punica (LoRA) wrapper class."""
+        return "vllm_kunlun.lora.punica_wrapper.punica_kunlun.PunicaWrapperKunlun"
--- a/vllm_kunlun/platforms/version.py
+++ b/vllm_kunlun/platforms/version.py
@@ -0,0 +1,8 @@
+"""vllm_kunlun version.py"""
+# NOTE(review): presumably the upstream vLLM release this plugin targets — confirm.
+vllm_version = "0.9.2"
+
+# (major, minor, patch) components that get_xvllm_version() joins into a string.
+xvllm_version_tuple = (0, 9, 2)
+
+def get_xvllm_version():
+    """Return the version in ``xvllm_version_tuple`` as a dotted "major.minor.patch" string."""
+    major, minor, patch = xvllm_version_tuple
+    return ".".join(str(part) for part in (major, minor, patch))