提交vllm0.11.0开发分支

2025-12-10 17:51:24 +08:00
parent deab7dd0b6
commit 7c22d621fb
175 changed files with 31856 additions and 8683 deletions
--- a/vllm_kunlun/platforms/envs.py
+++ b/vllm_kunlun/platforms/envs.py
@@ -4,28 +4,26 @@ import os
 from typing import TYPE_CHECKING, Any, Callable, Optional

 if TYPE_CHECKING:
-    VLLM_MULTI_LOGPATH: str = ("./log",)
-    ENABLE_VLLM_MULTI_LOG: bool = (False,)
-    ENABLE_VLLM_INFER_HOOK: bool = (False,)
-    ENABLE_VLLM_OPS_HOOK: bool = (False,)
-    ENABLE_VLLM_MODULE_HOOK: bool = False
-
+    VLLM_MULTI_LOGPATH: str = "./logs"  # same default as the runtime lookup
+    ENABLE_VLLM_MULTI_LOG: bool = False  # no trailing comma: it would make a tuple
+    ENABLE_VLLM_INFER_HOOK: bool = False
+    ENABLE_VLLM_OPS_HOOK: bool = False
+    ENABLE_VLLM_MODULE_HOOK: bool = False

 def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     """
-    If the value is None, return None; otherwise, convert the string to an integer and return it.
-
+    Convert an optional string to an integer, passing None through unchanged.
+
     Args:
-        value (Optional[str], optional): The optional string to convert. Defaults to None.
-
+        value (Optional[str]): The string to convert, or None.
+
     Returns:
-        Optional[int]: If the value is None, return None; otherwise, convert the string to an integer and return it.
+        Optional[int]: None if value is None; otherwise the result of int(value).
     """
     if value is None:
         return None
     return int(value)

-
 # The begin-* and end* here are used by the documentation generator
 # to extract the used env vars.

@@ -33,56 +31,59 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:

 xvllm_environment_variables: dict[str, Callable[[], Any]] = {
    # path to the logs of redirect-output, abstrac of related are ok
-    "VLLM_MULTI_LOGPATH": lambda: os.environ.get("VLLM_MULTI_LOGPATH", "./logs"),
-    # turn on / off multi-log of multi nodes & multi cards
-    "ENABLE_VLLM_MULTI_LOG": lambda: (
-        os.environ.get("ENABLE_VLLM_MULTI_LOG", "False").lower() in ("true", "1")
-    ),
-    # turn on / off XVLLM infer stage log ability
-    "ENABLE_VLLM_INFER_HOOK": lambda: (
-        os.environ.get("ENABLE_VLLM_INFER_HOOK", "False").lower() in ("true", "1")
-    ),
-    # turn on / off XVLLM infer_ops log ability
-    "ENABLE_VLLM_OPS_HOOK": lambda: (
-        os.environ.get("ENABLE_VLLM_OPS_HOOK", "False").lower() in ("true", "1")
-    ),
-    "ENABLE_VLLM_MODULE_HOOK": lambda: (
-        os.environ.get("ENABLE_VLLM_MODULE_HOOK", "False").lower() in ("true", "1")
-    ),
+    "VLLM_MULTI_LOGPATH":
+    lambda: os.environ.get("VLLM_MULTI_LOGPATH", "./logs"),
+
+    # turn on / off multi-log for multi-node & multi-card runs
+    "ENABLE_VLLM_MULTI_LOG":
+    lambda: (os.environ.get("ENABLE_VLLM_MULTI_LOG", "False").lower() in 
+             ("true", "1")),
+
+    # turn on / off XVLLM inference-stage logging
+    "ENABLE_VLLM_INFER_HOOK":
+    lambda: (os.environ.get("ENABLE_VLLM_INFER_HOOK", "False").lower() in
+            ("true", "1")),
+
+    # turn on / off XVLLM infer_ops logging
+    "ENABLE_VLLM_OPS_HOOK":
+    lambda: (os.environ.get("ENABLE_VLLM_OPS_HOOK", "False").lower() in
+            ("true", "1")),
+
+    "ENABLE_VLLM_MODULE_HOOK":
+    lambda: (os.environ.get("ENABLE_VLLM_MODULE_HOOK", "False").lower() in
+            ("true", "1")),
+
    # fuse sorted op with fused_moe kernel
-    "ENABLE_VLLM_MOE_FC_SORTED": lambda: (
-        os.environ.get("ENABLE_VLLM_MOE_FC_SORTED", "False").lower() in ("true", "1")
-    ),
+    "ENABLE_VLLM_MOE_FC_SORTED":
+    lambda: (os.environ.get("ENABLE_VLLM_MOE_FC_SORTED", "False").lower() in 
+             ("true", "1")),
+
    # enable custom dpsk scaling rope
-    "ENABLE_CUSTOM_DPSK_SCALING_ROPE": lambda: (
-        os.environ.get("ENABLE_CUSTOM_DPSK_SCALING_ROPE", "False").lower()
-        in ("true", "1")
-    ),
+    "ENABLE_CUSTOM_DPSK_SCALING_ROPE":
+    lambda: (os.environ.get("ENABLE_CUSTOM_DPSK_SCALING_ROPE", "False").lower() in 
+             ("true", "1")),
+
    # fuse qkv split & qk norm & qk rope
    # only works for qwen3 dense and qwen3 moe models
-    "ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE": lambda: (
-        os.environ.get("ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE", "False").lower()
-        in ("true", "1")
-    ),
+    "ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE":
+    lambda: (os.environ.get("ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE", "False").lower() in 
+             ("true", "1")),
 }

 # end-env-vars-definition

-
 def __getattr__(name: str):
    """
-    This function is called when an attribute that doesn't exist is accessed.
-    If the attribute is one of the xvllm_environment_variables, return the corresponding value.
-    Otherwise, raise an AttributeError.
-
+    Called when an attribute that does not exist on the module is accessed; returns the value of the matching entry in xvllm_environment_variables.
+
     Args:
-        name (str): The name of the attribute to retrieve.
-
+        name (str): The name of the attribute to retrieve.
+
     Raises:
-        AttributeError (Exception): If the attribute is not one of xvllm_environment_variables, this exception is raised.
-
+        AttributeError: If the attribute is not one of xvllm_environment_variables.
+
     Returns:
-        Any, optional: If the attribute is one of xvllm_environment_variables, the corresponding value is returned; otherwise, None is returned.
+        Any: The corresponding value when the attribute is one of xvllm_environment_variables.
    """
    # lazy evaluation of environment variables
    if name in xvllm_environment_variables:
@@ -92,14 +93,13 @@ def __getattr__(name: str):

 def __dir__():
     """
-    Returns a list of all visible variable names.
-
+    Return the names of all visible module-level environment variables.
+
+    The names are the keys of the `xvllm_environment_variables` dictionary.
+
     Returns:
-        list: A list of all visible variable names, which are defined through the `xvllm_environment_variables` dictionary.
-
-    Returns:
-        List[str]: A list of all visible variable names.
-                   These variables are defined through the `xvllm_environment_variables` dictionary.
+        List[str]: A list of all visible variable names.
+                   These variables are defined through the `xvllm_environment_variables` dictionary.
     """
     return list(xvllm_environment_variables.keys())

--- a/vllm_kunlun/platforms/kunlun.py
+++ b/vllm_kunlun/platforms/kunlun.py
@@ -19,10 +19,11 @@ class KunlunPlatform(Platform):

     @property
     def device_type(self):
-        """Returns the device type, which is fixed as 'cuda'.
+        """
+        Return the device type, which is fixed as 'cuda'.
         """
         return "cuda"
-
+    
    def is_kunlun(self) -> bool:
        """is_kunlun"""
        return self._enum == PlatformEnum.CUDA
@@ -69,13 +70,14 @@ class KunlunPlatform(Platform):

     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
-        """Returns the device name, which defaults to "kunlun".
-
+        """
+        Return the device name, which is always "kunlun".
+
         Args:
-            device_id (int, optional): The device ID, default is 0. Ignored in this method. Defaults to 0.
-
+            device_id (int, optional): The device ID. Ignored in this method. Defaults to 0.
+
         Returns:
-            str: The device name, which is fixed as "kunlun".
+            str: The device name, which is fixed as "kunlun".
         """
         return "kunlun"

@@ -89,23 +91,26 @@ class KunlunPlatform(Platform):

     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
-        """Returns the total memory size of the device in bytes (B). Defaults to the total memory size of the first device.
-        If the `device_id` parameter is not an integer or exceeds the available device range, a ValueError will be raised.
-
+        """
+        Return the total system memory in bytes, as reported by
+        `psutil.virtual_memory().total`; `device_id` does not affect the result.
+
         Args:
-            device_id (int, optional): The device ID, default is 0. Defaults to 0.
-
+            device_id (int, optional): The device ID; unused by this method. Defaults to 0.
+
         Raises:
-            ValueError: If the `device_id` parameter is not an integer or exceeds the available device range, this exception is raised.
-
+            None: `device_id` is never validated, so no ValueError is raised here.
+
         Returns:
-            int: The total memory size of the device in bytes (B).
+            int: The value of `psutil.virtual_memory().total`, in bytes (B).
         """
         return psutil.virtual_memory().total

     @classmethod
     def inference_mode(cls):
-        """Returns a context manager that disables gradient computation.
+        """
+        Return a context manager that disables gradient computation.
+        The returned object is `torch.no_grad()`.
         """
         return torch.no_grad()

@@ -114,29 +119,31 @@ class KunlunPlatform(Platform):
        """get_device_capability"""
        major, minor = torch.cuda.get_device_capability()
        return DeviceCapability(major=major, minor=minor)
+    

    @classmethod
    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
-        """Updates the default values of various components based on the configuration.
-        If not specified, automatically selects the worker class based on certain conditions.
-        If the block size is not set in the cache configuration, it is set to 16.
-        If using MLA and `VLLM_ATTENTION_BACKEND` is not set or is set to "FLASHMLA",
-        the cache block size is set to 64.
-        If running in DeepEP high throughput backend, data parallelism greater than 1, and CUDA graph mode,
-        it forces the use of eager mode, as DP + DeepEP high throughput kernels are not CUDA graph compatible,
-        and using DeepEP low latency kernels can resolve this issue.
-
+        """
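+        Updates the default values of various components based on the configuration.
+        If not specified, automatically selects the worker class based on certain conditions.
+        If the block size is not set in the cache configuration, it is set to 16.
+        If using MLA and `VLLM_ATTENTION_BACKEND` is not set or is set to "FLASHMLA",
+        the cache block size is set to 64.
+        If running in DeepEP high throughput backend, data parallelism greater than 1, and CUDA graph mode,
+        it forces the use of eager mode, as DP + DeepEP high throughput kernels are not CUDA graph compatible,
+        and using DeepEP low latency kernels can resolve this issue.
+
         Args:
-            vllm_config (VllmConfig): VLLM configuration object.
-
+            vllm_config (VllmConfig): VLLM configuration object.
+
         Raises:
-            NotImplementedError: If multi-step scheduling is used on vLLM V1, this exception is raised.
-            Please remove the --num-scheduler-steps argument from the command line.
-            NotImplementedError: If MLA is used on vLLM V1, this exception is raised.
-            Please ensure that the `VLLM_ATTENTION_BACKEND` environment variable is set before using MLA.
-
+            NotImplementedError: If multi-step scheduling is used on vLLM V1, this exception is raised.
+            Please remove the --num-scheduler-steps argument from the command line.
+            NotImplementedError: If MLA is used on vLLM V1, this exception is raised.
+            Please ensure that the `VLLM_ATTENTION_BACKEND` environment variable is set before using MLA.
+
         Returns:
-            None: No return value.
+            None: No return value.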
        """
        parallel_config = vllm_config.parallel_config
        scheduler_config = vllm_config.scheduler_config
@@ -159,7 +166,7 @@ class KunlunPlatform(Platform):
                            "vllm.v1.worker.gpu_worker.Worker"
                else:
                    parallel_config.worker_cls = "vllm.worker.worker.Worker"
-
+        
        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16
@@ -198,9 +205,10 @@ class KunlunPlatform(Platform):
            vllm_config.compilation_config.pass_config.enable_fusion = False
            vllm_config.compilation_config.use_inductor = False

+
    @classmethod
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1, use_mla,use_sink):
+                             kv_cache_dtype, block_size, use_v1, use_mla,use_sink, use_sparse=False):
        """
            Returns the class of attention backend based on the selected backend and other parameters.
        
@@ -227,15 +235,16 @@ class KunlunPlatform(Platform):
    def get_current_memory_usage(cls,
                                 device: Optional[torch.types.Device] = None
                                 ) -> float:
-        """Gets the current memory usage of the device, including allocated and max allocated.
-        If no device is specified, defaults to the current context's device.
-
-        Args:
-            device (Optional[torch.types.Device], optional): Optional device object, defaults to None. Defaults to the current context's device.
-
-        Returns:
-            float: Returns a float representing the current memory usage of the device, in bytes.
-
+        """
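+        Gets the current memory usage of the device, including allocated and max allocated.
+        If no device is specified, defaults to the current context's device.
+
+            Args:
+                device (Optional[torch.types.Device], optional): Optional device object. Defaults to None, meaning the current context's device.
+
+            Returns:
+                float: The current memory usage of the device, in bytes.
+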
            Raises:
                None.
        """
@@ -244,17 +253,18 @@ class KunlunPlatform(Platform):

     @classmethod
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        """Checks if asynchronous output is supported.
-        By default, Kunlun does not support asynchronous output.
-
-        Args:
-            enforce_eager (Optional[bool], optional): Whether to enforce eager execution. Defaults to None.
-                None means not to force eager execution, but to automatically select based on the current environment.
-
-        Returns:
-            bool: True means asynchronous output is supported, False means asynchronous output is not supported.
         """
-        # Assume Kunlun does not support asynchronous output
+        Check whether asynchronous output is supported.
+        This implementation always reports that it is not.
+
+        Args:
+            enforce_eager (Optional[bool]): Whether eager execution is enforced.
+                Not used by this method; the result is the same for every value.
+
+        Returns:
+            bool: Always False, i.e. asynchronous output is not supported.
+        """
+        # Assume Kunlun does not support asynchronous output
         return False

    @classmethod
@@ -279,11 +289,42 @@ class KunlunPlatform(Platform):

    @classmethod
    def get_device_communicator_cls(cls) -> str:
-        '''
+       '''
       communicator
       '''
-        return "vllm_kunlun.distributed.kunlun_communicator.KunlunCommunicator"
+       return "vllm_kunlun.distributed.kunlun_communicator.KunlunCommunicator"

     @classmethod
     def get_punica_wrapper(cls):
-        return "vllm_kunlun.lora.punica_wrapper.punica_kunlun.PunicaWrapperKunlun"
+        return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"  # NOTE(review): replaces the Kunlun-specific wrapper path with vLLM's CPU wrapper path; confirm this is intended
+    
+    @classmethod
+    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
+        '''
+        Check torch_dtype against the dtypes supported on the Kunlun3 platform; raise ValueError if it is not one of them.
+        '''
+        supported_dtypes = {
+            torch.float32,
+            torch.float16,
+            torch.bfloat16,
+            torch.int8,
+        }
+        if torch_dtype not in supported_dtypes:
+            raise ValueError(
+                f"Kunlun platform does not support dtype {torch_dtype}. "
+                "Supported dtypes are: fp32, fp16, bf16, int8."
+            )
+       
+    @classmethod
+    def opaque_attention_op(cls) -> bool:
+        """Ensure the V1 graph on the Kunlun3 platform uses
+        vllm.unified_attention_with_output_kunlun as split ops; always True."""
+        return True
+    
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True  # unconditionally reports hybrid KV cache support
+
+    @classmethod
+    def support_static_graph_mode(cls) -> bool:
+        return True  # unconditionally reports static graph mode support