提交vllm0.11.0开发分支

This commit is contained in:
chenyili
2025-12-10 17:51:24 +08:00
parent deab7dd0b6
commit 7c22d621fb
175 changed files with 31856 additions and 8683 deletions

View File

@@ -4,28 +4,26 @@ import os
from typing import TYPE_CHECKING, Any, Callable, Optional
if TYPE_CHECKING:
    # Static-only declarations: they are never executed at runtime, but let
    # type checkers see the attributes that this module resolves lazily
    # through ``xvllm_environment_variables``.  Each default mirrors the
    # fallback used by the matching lambda in that dictionary.
    # No trailing commas here: ``x: bool = False,`` would declare a tuple.
    VLLM_MULTI_LOGPATH: str = "./logs"
    ENABLE_VLLM_MULTI_LOG: bool = False
    ENABLE_VLLM_INFER_HOOK: bool = False
    ENABLE_VLLM_OPS_HOOK: bool = False
    ENABLE_VLLM_MODULE_HOOK: bool = False
    ENABLE_VLLM_MOE_FC_SORTED: bool = False
    ENABLE_CUSTOM_DPSK_SCALING_ROPE: bool = False
    ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE: bool = False
def maybe_convert_int(value: Optional[str]) -> Optional[int]:
    """Convert an optional string to an optional integer.

    Args:
        value: The string to convert, or None.

    Returns:
        None when ``value`` is None; otherwise the result of ``int(value)``.

    Raises:
        ValueError: If ``value`` is a string that ``int`` cannot parse.
    """
    if value is None:
        return None
    return int(value)
# The begin-* and end* here are used by the documentation generator
# to extract the used env vars.
@@ -33,56 +31,59 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
# Maps each supported environment variable to a zero-argument callable that
# reads it from ``os.environ`` when invoked, so values are looked up on access
# rather than at import time.  Boolean switches are true only for the
# case-insensitive values "true" and "1".  Every key appears exactly once.
xvllm_environment_variables: dict[str, Callable[[], Any]] = {
    # directory for the redirected log output
    # (presumably absolute and relative paths both work -- confirm)
    "VLLM_MULTI_LOGPATH":
    lambda: os.environ.get("VLLM_MULTI_LOGPATH", "./logs"),
    # turn on / off multi-log of multi nodes & multi cards
    "ENABLE_VLLM_MULTI_LOG":
    lambda: (os.environ.get("ENABLE_VLLM_MULTI_LOG", "False").lower() in
             ("true", "1")),
    # turn on / off XVLLM infer stage log ability
    "ENABLE_VLLM_INFER_HOOK":
    lambda: (os.environ.get("ENABLE_VLLM_INFER_HOOK", "False").lower() in
             ("true", "1")),
    # turn on / off XVLLM infer_ops log ability
    "ENABLE_VLLM_OPS_HOOK":
    lambda: (os.environ.get("ENABLE_VLLM_OPS_HOOK", "False").lower() in
             ("true", "1")),
    # turn on / off the XVLLM module hook
    # (presumably module-level logging, like the hooks above -- confirm)
    "ENABLE_VLLM_MODULE_HOOK":
    lambda: (os.environ.get("ENABLE_VLLM_MODULE_HOOK", "False").lower() in
             ("true", "1")),
    # fuse sorted op with fused_moe kernel
    "ENABLE_VLLM_MOE_FC_SORTED":
    lambda: (os.environ.get("ENABLE_VLLM_MOE_FC_SORTED", "False").lower() in
             ("true", "1")),
    # enable custom dpsk scaling rope
    "ENABLE_CUSTOM_DPSK_SCALING_ROPE":
    lambda: (os.environ.get("ENABLE_CUSTOM_DPSK_SCALING_ROPE",
                            "False").lower() in ("true", "1")),
    # fuse qkv split & qk norm & qk rope
    # only works for qwen3 dense and qwen3 moe models
    "ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE":
    lambda: (os.environ.get("ENABLE_VLLM_FUSED_QKV_SPLIT_NORM_ROPE",
                            "False").lower() in ("true", "1")),
}
# end-env-vars-definition
def __getattr__(name: str):
"""
This function is called when an attribute that doesn't exist is accessed.
If the attribute is one of the xvllm_environment_variables, return the corresponding value.
Otherwise, raise an AttributeError.
当调用不存在的属性时该函数被调用。如果属性是xvllm_environment_variables中的一个则返回相应的值。否则引发AttributeError异常。
Args:
name (str): The name of the attribute to retrieve.
name (str): 要获取的属性名称。
Raises:
AttributeError (Exception): If the attribute is not one of xvllm_environment_variables, this exception is raised.
AttributeError (Exception): 如果属性不是xvllm_environment_variables中的一个则会引发此异常。
Returns:
Any, optional: If the attribute is one of xvllm_environment_variables, the corresponding value is returned; otherwise, None is returned.
Any, optional: 如果属性是xvllm_environment_variables中的一个则返回相应的值否则返回None。
"""
# lazy evaluation of environment variables
if name in xvllm_environment_variables:
@@ -92,14 +93,13 @@ def __getattr__(name: str):
def __dir__():
    """Return the names of the environment-backed module attributes.

    Returns:
        list[str]: The keys of the ``xvllm_environment_variables`` dictionary.
    """
    return list(xvllm_environment_variables.keys())

View File

@@ -19,10 +19,11 @@ class KunlunPlatform(Platform):
@property
def device_type(self):
    """Return the device type string, which is always ``"cuda"``."""
    return "cuda"
def is_kunlun(self) -> bool:
    """Return True when this platform's ``_enum`` equals ``PlatformEnum.CUDA``."""
    # NOTE(review): the check is against the CUDA enum member, presumably
    # because Kunlun presents itself as a CUDA platform -- confirm.
    return self._enum == PlatformEnum.CUDA
@@ -69,13 +70,14 @@ class KunlunPlatform(Platform):
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
    """Return the device name, which is always ``"kunlun"``.

    Args:
        device_id: Device index. Ignored by this method. Defaults to 0.

    Returns:
        str: The fixed string ``"kunlun"``.
    """
    return "kunlun"
@@ -89,23 +91,26 @@ class KunlunPlatform(Platform):
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
    """Return the total memory size in bytes as reported by psutil.

    The value is ``psutil.virtual_memory().total``. ``device_id`` is not
    used, so the result is the same for every device, and no validation
    of ``device_id`` is performed (this method raises no ``ValueError``).

    NOTE(review): psutil reports system memory, not accelerator memory --
    confirm that this is the intended figure for a Kunlun device.

    Args:
        device_id: Device index. Ignored by this method. Defaults to 0.

    Returns:
        int: Total memory size in bytes.
    """
    return psutil.virtual_memory().total
@classmethod
def inference_mode(cls):
    """Return a context manager that disables gradient computation.

    Returns:
        A fresh ``torch.no_grad()`` context manager.
    """
    return torch.no_grad()
@@ -114,29 +119,31 @@ class KunlunPlatform(Platform):
"""get_device_capability"""
major, minor = torch.cuda.get_device_capability()
return DeviceCapability(major=major, minor=minor)
@classmethod
def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
"""Updates the default values of various components based on the configuration.
If not specified, automatically selects the worker class based on certain conditions.
If the block size is not set in the cache configuration, it is set to 16.
If using MLA and `VLLM_ATTENTION_BACKEND` is not set or is set to "FLASHMLA",
the cache block size is set to 64.
If running in DeepEP high throughput backend, data parallelism greater than 1, and CUDA graph mode,
it forces the use of eager mode, as DP + DeepEP high throughput kernels are not CUDA graph compatible,
and using DeepEP low latency kernels can resolve this issue.
"""
根据配置更新各个部分的默认值。
如果未指定则根据某些条件自动选择worker类。
如果缓存配置中没有设置块大小则将其设置为16。
如果使用MLA并且`VLLM_ATTENTION_BACKEND`未设置或设置为"FLASHMLA"
则将缓存块大小设置为64。
如果在DeepEP高吞吐量后端、数据并行大于1和CUDA图形模式下运行则强制
强制执行即时模式因为DP + DeepEP高吞吐量内核不是CUDA图形兼容的而且
使用DeepEP低延迟内核可以解决这个问题。
Args:
vllm_config (VllmConfig): VLLM configuration object.
vllm_config (VllmConfig): VLLM配置对象。
Raises:
NotImplementedError: If multi-step scheduling is used on vLLM V1, this exception is raised.
Please remove the --num-scheduler-steps argument from the command line.
NotImplementedError: If MLA is used on vLLM V1, this exception is raised.
Please ensure that the `VLLM_ATTENTION_BACKEND` environment variable is set before using MLA.
NotImplementedError: 如果在vLLM V1上使用多步调度则会引发NotImplementedError。
请从命令行中删除--num-scheduler-steps参数。
NotImplementedError: 如果在vLLM V1上使用MLA则会引发NotImplementedError。
请确保在使用MLA之前设置了`VLLM_ATTENTION_BACKEND`环境变量。
Returns:
None: No return value.
None: 无返回值。
"""
parallel_config = vllm_config.parallel_config
scheduler_config = vllm_config.scheduler_config
@@ -159,7 +166,7 @@ class KunlunPlatform(Platform):
"vllm.v1.worker.gpu_worker.Worker"
else:
parallel_config.worker_cls = "vllm.worker.worker.Worker"
cache_config = vllm_config.cache_config
if cache_config and cache_config.block_size is None:
cache_config.block_size = 16
@@ -198,9 +205,10 @@ class KunlunPlatform(Platform):
vllm_config.compilation_config.pass_config.enable_fusion = False
vllm_config.compilation_config.use_inductor = False
@classmethod
def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
kv_cache_dtype, block_size, use_v1, use_mla,use_sink):
kv_cache_dtype, block_size, use_v1, use_mla,use_sink, use_sparse=False):
"""
Returns the class of attention backend based on the selected backend and other parameters.
@@ -227,15 +235,16 @@ class KunlunPlatform(Platform):
def get_current_memory_usage(cls,
device: Optional[torch.types.Device] = None
) -> float:
"""Gets the current memory usage of the device, including allocated and max allocated.
If no device is specified, defaults to the current context's device.
Args:
device (Optional[torch.types.Device], optional): Optional device object, defaults to None. Defaults to the current context's device.
Returns:
float: Returns a float representing the current memory usage of the device, in bytes.
"""
获取当前设备的内存使用情况,包括已分配和最大分配。
如果未指定设备,则默认为当前上下文中的设备。
Args:
device (Optional[torch.types.Device], optional): 可选的设备对象默认为None。默认为当前上下文中的设备。
Returns:
float: 返回一个浮点数表示当前设备的内存使用情况单位是字节bytes
Raises:
None.
"""
@@ -244,17 +253,18 @@ class KunlunPlatform(Platform):
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
    """Report whether asynchronous output is supported.

    Args:
        enforce_eager: Whether eager execution is forced. The value is not
            consulted; the answer is the same for every input.

    Returns:
        bool: Always False.
    """
    # Assume Kunlun does not support asynchronous output.
    return False
@classmethod
@@ -279,11 +289,42 @@ class KunlunPlatform(Platform):
@classmethod
def get_device_communicator_cls(cls) -> str:
    """Return the dotted import path of the Kunlun device communicator class.

    Returns:
        str: ``"vllm_kunlun.distributed.kunlun_communicator.KunlunCommunicator"``.
    """
    return "vllm_kunlun.distributed.kunlun_communicator.KunlunCommunicator"
@classmethod
def get_punica_wrapper(cls):
    """Return the dotted import path of the Punica (LoRA) wrapper class.

    Returns:
        str: ``"vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"``.
    """
    # NOTE(review): two return statements were present here, one for
    # "vllm_kunlun.lora.punica_wrapper.punica_kunlun.PunicaWrapperKunlun" and
    # one for the CPU wrapper below.  The CPU wrapper is kept as the later
    # revision -- confirm this is the intended backend.
    return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"
@classmethod
def check_if_supports_dtype(cls, torch_dtype: torch.dtype) -> None:
    """Validate that ``torch_dtype`` is supported on the Kunlun3 platform.

    Args:
        torch_dtype: The dtype to check.

    Raises:
        ValueError: If ``torch_dtype`` is not one of float32, float16,
            bfloat16 or int8.
    """
    supported_dtypes = {
        torch.float32,
        torch.float16,
        torch.bfloat16,
        torch.int8,
    }
    if torch_dtype not in supported_dtypes:
        raise ValueError(
            f"Kunlun platform does not support dtype {torch_dtype}. "
            "Supported dtypes are: fp32, fp16, bf16, int8."
        )
@classmethod
def opaque_attention_op(cls) -> bool:
    """Report that attention is treated as an opaque op; always True.

    Ensures that the V1 graph on the Kunlun3 platform uses
    ``vllm.unified_attention_with_output_kunlun`` as the split op.

    Declared as a classmethod, matching the ``cls`` parameter and the
    neighbouring platform hooks, so it can be called on the class as well
    as on an instance.

    Returns:
        bool: Always True.
    """
    return True
@classmethod
def support_hybrid_kv_cache(cls) -> bool:
    """Report whether the hybrid KV cache is supported; always True."""
    return True
@classmethod
def support_static_graph_mode(cls) -> bool:
    """Report whether static graph mode is supported; always True."""
    return True