"""Kunlun XPU platform plugin for vLLM."""

from typing import Optional, Union

import psutil
import torch

import vllm.envs as envs
from vllm.logger import init_logger
from vllm.platforms.interface import (DeviceCapability, Platform,
                                      PlatformEnum, _Backend)

logger = init_logger(__name__)
class KunlunPlatform(Platform):
    """vLLM platform implementation for Kunlun XPU devices.

    The platform enum is deliberately set to ``PlatformEnum.CUDA`` so that
    vLLM's CUDA code paths are reused, while attention backends and the
    device communicator are routed to Kunlun-specific implementations
    (``vllm_kunlun.*``).
    """

    _enum = PlatformEnum.CUDA
    # Distributed backend used for collective communication.
    dist_backend: str = "nccl"
    # Resource key Ray uses to schedule workers on this device.
    ray_device_key: str = "GPU"
    # Name of the underlying hardware device.
    device_name: str = "xpu"

    @property
    def device_type(self):
        """Return the device type, fixed to 'cuda' (reuses CUDA code paths)."""
        return "cuda"

    def is_kunlun(self) -> bool:
        """Return True for the Kunlun platform (its enum is spoofed as CUDA)."""
        return self._enum == PlatformEnum.CUDA

    def is_cuda(self) -> bool:
        """Return False: this platform is not genuine CUDA."""
        return False

    def is_rocm(self) -> bool:
        """Return True if the platform enum is ROCM."""
        return self._enum == PlatformEnum.ROCM

    def is_tpu(self) -> bool:
        """Return True if the platform enum is TPU."""
        return self._enum == PlatformEnum.TPU

    def is_hpu(self) -> bool:
        """Return True if the platform enum is HPU."""
        return self._enum == PlatformEnum.HPU

    def is_xpu(self) -> bool:
        """Return True if the platform enum is XPU."""
        return self._enum == PlatformEnum.XPU

    def is_cpu(self) -> bool:
        """Return True if the platform enum is CPU."""
        return self._enum == PlatformEnum.CPU

    def is_neuron(self) -> bool:
        """Return True if the platform enum is NEURON."""
        return self._enum == PlatformEnum.NEURON

    def is_out_of_tree(self) -> bool:
        """Return True if the platform is an out-of-tree (OOT) platform."""
        return self._enum == PlatformEnum.OOT

    def is_cuda_alike(self) -> bool:
        """Return True if the platform behaves like CUDA (CUDA or ROCM)."""
        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)

    def is_sleep_mode_available(self) -> bool:
        """Return True when sleep mode is available (CUDA-like enum only)."""
        return self._enum == PlatformEnum.CUDA

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the device name, fixed to "kunlun".

        Args:
            device_id: Device index. Ignored by this method. Defaults to 0.

        Returns:
            str: Always "kunlun".
        """
        return "kunlun"

    @classmethod
    def get_piecewise_backend_cls(cls) -> str:
        """Return the dotted path of the piecewise compilation backend."""
        return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa

    @classmethod
    def get_static_graph_wrapper_cls(cls) -> str:
        """Return the dotted path of the static graph wrapper class."""
        return "vllm.compilation.cuda_graph.CUDAGraphWrapper"  # noqa

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return the total memory size in bytes.

        NOTE(review): this reports *host* virtual memory via psutil, not
        per-device memory, and ``device_id`` is effectively ignored —
        confirm this is the intended behavior on Kunlun.

        Args:
            device_id: Device index. Defaults to 0.

        Returns:
            int: Total memory size in bytes.
        """
        return psutil.virtual_memory().total

    @classmethod
    def inference_mode(cls):
        """Return a context manager that disables gradient computation.

        Returns:
            A ``torch.no_grad()`` context manager.
        """
        return torch.no_grad()

    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        """Return the (major, minor) compute capability of the current device.

        NOTE(review): ``device_id`` is not forwarded to
        ``torch.cuda.get_device_capability`` — the current device is always
        queried. Confirm whether per-device queries are needed.
        """
        major, minor = torch.cuda.get_device_capability()
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """Validate and update defaults in ``vllm_config`` for Kunlun.

        - Selects a worker class when ``parallel_config.worker_cls`` is
          "auto", depending on speculative decoding and the V1 engine flag.
        - Defaults the KV-cache block size to 16 when unset.
        - Forces the KV-cache block size to 64 when the FlashMLA (or sparse
          MLA) backend will be used.
        - Disables CUDA graphs and forces eager mode when running DeepEP
          high-throughput kernels with data parallelism > 1, since those
          kernels are not CUDA-graph compatible.

        Args:
            vllm_config: The vLLM configuration object, updated in place.

        Returns:
            None
        """
        parallel_config = vllm_config.parallel_config
        model_config = vllm_config.model_config

        if parallel_config.worker_cls == "auto":
            if vllm_config.speculative_config:
                if envs.VLLM_USE_V1:
                    parallel_config.worker_cls = \
                        "vllm.v1.worker.gpu_worker.Worker"
                else:
                    parallel_config.worker_cls = \
                        "vllm.spec_decode.spec_decode_worker.create_spec_worker"
                    parallel_config.sd_worker_cls = \
                        "vllm.worker.worker.Worker"
            else:
                # Was a bare print(); use the module logger with lazy args.
                logger.debug("envs.VLLM_USE_V1 = %s", envs.VLLM_USE_V1)
                if envs.VLLM_USE_V1:
                    parallel_config.worker_cls = \
                        "vllm.v1.worker.gpu_worker.Worker"
                else:
                    parallel_config.worker_cls = "vllm.worker.worker.Worker"

        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16

        # TODO(lucas): handle this more gracefully
        # Note: model_config may be None during testing
        if model_config is not None and model_config.use_mla:
            # If `VLLM_ATTENTION_BACKEND` is not set and we are using MLA,
            # we default to the FlashMLA backend, which requires a KV-cache
            # block size of 64.
            use_sparse = hasattr(vllm_config.model_config.hf_config,
                                 "index_topk")
            use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None
                            or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
            from vllm.attention.ops.flashmla import is_flashmla_supported
            if use_flashmla and is_flashmla_supported()[0] \
                    and cache_config.block_size != 64:
                cache_config.block_size = 64
                logger.info(
                    "Forcing kv cache block size to 64 for FlashMLA backend.")
            if use_sparse and cache_config.block_size != 64:
                cache_config.block_size = 64
                logger.info(
                    "Forcing kv cache block size to 64 for FlashMLASparse "
                    "backend.")

        if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
                and parallel_config.data_parallel_size > 1
                and vllm_config.compilation_config.use_cudagraph):
            logger.info(
                "Data Parallel: Forcing enforce eager to be True since DP "
                "with DeepEP high-throughput kernels are not CUDA Graph "
                "compatible. The DeepEP low-latency kernels are CUDA Graph "
                "compatible. Set the all_to_all backend to deepep_low_latency "
                "to use those kernels instead.")
            vllm_config.compilation_config.use_cudagraph = False
            vllm_config.model_config.enforce_eager = True
            # TODO (varun): Turning this ON gives incorrect results for the
            # Deepseek-V2-lite model.
            vllm_config.compilation_config.use_inductor = False

        if vllm_config.compilation_config.use_cudagraph and envs.VLLM_USE_V1:
            # NOTE(review): presumably Kunlun graph capture needs custom ops
            # and no inductor — confirm against the Kunlun runtime docs.
            vllm_config.compilation_config.custom_ops = ["all"]
            vllm_config.compilation_config.pass_config.enable_fusion = False
            vllm_config.compilation_config.use_inductor = False

    @classmethod
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                             kv_cache_dtype, block_size, use_v1, use_mla,
                             use_sink, use_sparse=False):
        """Return the dotted class path of the attention backend to use.

        Args:
            selected_backend: Requested backend (not consulted here).
            head_size: Attention head size (not consulted here).
            dtype: Model data type (not consulted here).
            kv_cache_dtype: KV-cache data type (not consulted here).
            block_size: KV-cache block size (not consulted here).
            use_v1: Whether the V1 engine is in use.
            use_mla: Whether MLA attention is in use.
            use_sink: Whether attention sinks are in use (not consulted here).
            use_sparse: Whether sparse MLA is in use. Defaults to False.

        Returns:
            str: Fully-qualified class name of the attention backend.
        """
        if use_mla:
            if use_sparse:
                logger.info_once("Using Sparse MLA backend on V1 engine.")
                return ("vllm_kunlun.v1.attention.backends.mla.flashmla_sparse."
                        "FlashMLASparseBackend")
            return "vllm_kunlun.v1.attention.backends.mla.flashmla.FlashMLABackend"
        # use_mla is False from here on: the former trailing `else` branch
        # (KunlunMLAAttentionBackend) was unreachable and has been removed.
        if use_v1:
            return "vllm_kunlun.v1.attention.backends.kunlun_attn.KunlunAttentionBackend"
        return "vllm_kunlun.ops.attention.backends.kunlun_attn.KunlunAttentionBackend"

    @classmethod
    def get_current_memory_usage(cls,
                                 device: Optional[torch.types.Device] = None
                                 ) -> float:
        """Return the peak allocated memory on ``device`` in bytes.

        Resets the peak-memory statistics first, so the returned value
        reflects allocations from this point onward. Defaults to the
        current device when ``device`` is None.

        Args:
            device: Optional device to query. Defaults to None.

        Returns:
            float: Peak allocated memory in bytes.
        """
        torch.cuda.reset_peak_memory_stats(device)
        return torch.cuda.max_memory_allocated(device)

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Return whether asynchronous output is supported.

        Kunlun is assumed not to support async output, regardless of
        ``enforce_eager``.

        Args:
            enforce_eager: Whether eager execution is forced (ignored).

        Returns:
            bool: Always False.
        """
        return False

    @classmethod
    def supports_v1(cls, model_config: "ModelConfig") -> bool:
        """Check if the model config is supported by the V1 engine.

        Args:
            model_config: Model configuration to be checked (ignored).

        Returns:
            bool: Always True for this platform.
        """
        return True

    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """Set the current device for this platform."""
        torch.cuda.set_device(device)

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """Return the dotted path of the Kunlun device communicator class."""
        return "vllm_kunlun.distributed.kunlun_communicator.KunlunCommunicator"

    @classmethod
    def get_punica_wrapper(cls):
        """Return the dotted path of the Punica (LoRA) wrapper class.

        NOTE(review): this selects the CPU Punica wrapper — confirm that is
        intentional for Kunlun devices.
        """
        return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"

    @classmethod
    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
        """Validate that ``torch_dtype`` is supported on the Kunlun3 platform.

        Raises:
            ValueError: If the dtype is not one of fp32/fp16/bf16/int8.
        """
        supported_dtypes = {
            torch.float32,
            torch.float16,
            torch.bfloat16,
            torch.int8,
        }
        if torch_dtype not in supported_dtypes:
            raise ValueError(
                f"Kunlun platform does not support dtype {torch_dtype}. "
                "Supported dtypes are: fp32, fp16, bf16, int8."
            )

    @classmethod
    def opaque_attention_op(cls) -> bool:
        """Ensure V1 graph mode on Kunlun3 splits on
        vllm.unified_attention_with_output_kunlun.

        Fixed: the ``@classmethod`` decorator was missing even though the
        method takes ``cls`` and all sibling predicates are classmethods.
        """
        return True

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        """Return True: hybrid KV cache is supported."""
        return True

    @classmethod
    def support_static_graph_mode(cls) -> bool:
        """Return True: static graph mode is supported."""
        return True