This commit is contained in: root
2026-04-09 11:19:36 +08:00
parent 809cecae09
commit 8082d5f4b2
2579 changed files with 3675 additions and 0 deletions

@@ -0,0 +1,391 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, ClassVar, Generic, Protocol, TypeVar, get_args
import torch
from vllm.model_executor.layers.linear import ColumnParallelLinear
from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
if TYPE_CHECKING:
from vllm.config.cache import CacheDType
from vllm.platforms.interface import DeviceCapability
from vllm.v1.attention.backends.utils import KVCacheLayoutType
class AttentionType:
"""
Attention type.
Use string to be compatible with `torch.compile`.
"""
DECODER = "decoder"
"""Decoder attention between previous layer Q/K/V."""
ENCODER = "encoder"
"""Encoder attention between previous layer Q/K/V for encoder-decoder."""
ENCODER_ONLY = "encoder_only"
"""Encoder attention between previous layer Q/K/V."""
ENCODER_DECODER = "encoder_decoder"
"""Attention between dec. Q and enc. K/V for encoder-decoder."""
class MultipleOf:
base: int
def __init__(self, base: int):
self.base = base
class AttentionBackend(ABC):
"""Abstract class for attention backends."""
# For some attention backends, we allocate an output tensor before
# calling the custom op. When piecewise cudagraph is enabled, this
# makes sure the output tensor is allocated inside the cudagraph.
accept_output_buffer: bool = False
supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(1)]
supported_kv_cache_dtypes: ClassVar[list["CacheDType"]] = ["auto"]
@staticmethod
@abstractmethod
def get_name() -> str:
raise NotImplementedError
@staticmethod
@abstractmethod
def get_impl_cls() -> type["AttentionImpl"]:
raise NotImplementedError
@staticmethod
@abstractmethod
def get_builder_cls(): # -> Type["AttentionMetadataBuilder"]:
raise NotImplementedError
@staticmethod
@abstractmethod
def get_kv_cache_shape(
num_blocks: int,
block_size: int,
num_kv_heads: int,
head_size: int,
cache_dtype_str: str = "auto",
) -> tuple[int, ...]:
raise NotImplementedError
@staticmethod
def get_kv_cache_stride_order() -> tuple[int, ...]:
raise NotImplementedError
@classmethod
def full_cls_name(cls) -> tuple[str, str]:
return (cls.__module__, cls.__qualname__)
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return []
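# Note (editorial): returning an empty list, the default, means the backend places
# no restriction on head size; supports_head_size() below treats an empty list as
# "any head size is supported".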
@classmethod
def supports_head_size(cls, head_size: int) -> bool:
supported_head_sizes = cls.get_supported_head_sizes()
return (not supported_head_sizes) or head_size in supported_head_sizes
@classmethod
def supports_dtype(cls, dtype: torch.dtype) -> bool:
return dtype in cls.supported_dtypes
@classmethod
def supports_kv_cache_dtype(cls, kv_cache_dtype: "CacheDType | None") -> bool:
if kv_cache_dtype is None:
return True
return (not cls.supported_kv_cache_dtypes) or (
kv_cache_dtype in cls.supported_kv_cache_dtypes
)
@classmethod
def supports_block_size(cls, block_size: int | None) -> bool:
from vllm.config.cache import BlockSize
if block_size is None:
return True
valid_sizes = get_args(BlockSize)
if block_size not in valid_sizes:
return False
if not cls.supported_kernel_block_sizes:
return True
for supported_size in cls.supported_kernel_block_sizes:
is_multiple_of = (
isinstance(supported_size, MultipleOf)
and block_size % supported_size.base == 0
)
is_int_equal = (
isinstance(supported_size, int) and block_size == supported_size
)
if is_multiple_of or is_int_equal:
return True
return False
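# Illustrative example (not in the original source): with
# supported_kernel_block_sizes = [MultipleOf(16)], block sizes 16, 32, 48, ... pass
# this check, while 8 or 24 do not; a plain int entry such as 64 matches only exactly 64.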
@classmethod
def is_mla(cls) -> bool:
return False
@classmethod
def supports_sink(cls) -> bool:
return False
@classmethod
def is_sparse(cls) -> bool:
return False
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
"""Check if backend supports a given attention type.
By default, only supports decoder attention.
Backends should override this to support other attention types.
"""
from vllm.attention import AttentionType
return attn_type == AttentionType.DECODER
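# Illustrative example (not in the original source): an encoder-capable backend could
# override this to also accept AttentionType.ENCODER_ONLY or AttentionType.ENCODER_DECODER.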
@classmethod
def supports_compute_capability(cls, capability: "DeviceCapability") -> bool:
return True
@classmethod
def supports_combination(
cls,
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None",
block_size: int | None,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
device_capability: "DeviceCapability",
) -> str | None:
return None
@classmethod
def validate_configuration(
cls,
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None",
block_size: int | None,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
device_capability: "DeviceCapability",
attn_type: str,
) -> list[str]:
invalid_reasons = []
if not cls.supports_head_size(head_size):
invalid_reasons.append("head_size not supported")
if not cls.supports_dtype(dtype):
invalid_reasons.append("dtype not supported")
if not cls.supports_kv_cache_dtype(kv_cache_dtype):
invalid_reasons.append("kv_cache_dtype not supported")
if not cls.supports_block_size(block_size):
invalid_reasons.append("block_size not supported")
if use_mla != cls.is_mla():
if use_mla:
invalid_reasons.append("MLA not supported")
else:
invalid_reasons.append("non-MLA not supported")
if has_sink and not cls.supports_sink():
invalid_reasons.append("sink setting not supported")
if use_sparse != cls.is_sparse():
if use_sparse:
invalid_reasons.append("sparse not supported")
else:
invalid_reasons.append("non-sparse not supported")
if not cls.supports_compute_capability(device_capability):
invalid_reasons.append("compute capability not supported")
if not cls.supports_attn_type(attn_type):
invalid_reasons.append(f"attention type {attn_type} not supported")
combination_reason = cls.supports_combination(
head_size,
dtype,
kv_cache_dtype,
block_size,
use_mla,
has_sink,
use_sparse,
device_capability,
)
if combination_reason is not None:
invalid_reasons.append(combination_reason)
return invalid_reasons
@classmethod
def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
return None
class AttentionMetadata:
pass
T = TypeVar("T", bound=AttentionMetadata)
class AttentionLayer(Protocol):
_q_scale: torch.Tensor
_k_scale: torch.Tensor
_v_scale: torch.Tensor
_q_scale_float: float
_k_scale_float: float
_v_scale_float: float
_prob_scale: torch.Tensor
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
) -> torch.Tensor: ...
class AttentionImpl(ABC, Generic[T]):
# Whether the attention impl can return the softmax lse for decode.
# Some features like decode context parallelism require the softmax lse.
can_return_lse_for_decode: bool = False
# Some attention backends may choose not to return the LSE even when they can
# (e.g. for efficiency reasons).
need_to_return_lse_for_decode: bool = False
dcp_world_size: int
dcp_rank: int
def __new__(cls, *args, **kwargs):
# use __new__ so that all subclasses will call this
self = super().__new__(cls)
try:
from vllm.distributed.parallel_state import get_dcp_group
self.dcp_world_size = get_dcp_group().world_size
self.dcp_rank = get_dcp_group().rank_in_group
except AssertionError:
# DCP might not be initialized in testing
self.dcp_world_size = 1
self.dcp_rank = 0
self.need_to_return_lse_for_decode = (
self.dcp_world_size > 1 and self.can_return_lse_for_decode
)
return self
@abstractmethod
def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: int | None = None,
alibi_slopes: list[float] | None = None,
sliding_window: int | None = None,
kv_cache_dtype: str = "auto",
logits_soft_cap: float | None = None,
attn_type: str = AttentionType.DECODER,
kv_sharing_target_layer_name: str | None = None,
) -> None:
raise NotImplementedError
@abstractmethod
def forward(
self,
layer: AttentionLayer,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: T,
output: torch.Tensor | None = None,
output_scale: torch.Tensor | None = None,
output_block_scale: torch.Tensor | None = None,
) -> torch.Tensor:
raise NotImplementedError
def fused_output_quant_supported(self, quant_key: QuantKey):
"""
Does this attention implementation support fused output quantization?
This is used by the AttnFusionPass to only fuse output quantization
onto implementations that support it.
:param quant_key: QuantKey object that describes the quantization op
:return: is fusion supported for this type of quantization
"""
return False
def supports_quant_query_input(self) -> bool:
"""
Check if this attention implementation supports pre-quantized query input.
When True, the attention layer will quantize queries before passing them
to this backend, allowing torch.compile to fuse the quantization with
previous operations. This is typically supported when using FP8 KV cache
with compatible attention kernels (e.g., TRT-LLM).
TODO add support to more backends:
https://github.com/vllm-project/vllm/issues/25584
Returns:
bool: True if the implementation can accept pre-quantized queries.
"""
return False
def process_weights_after_loading(self, act_dtype: torch.dtype):
pass
class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
@abstractmethod
def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: int,
alibi_slopes: list[float] | None,
sliding_window: int | None,
kv_cache_dtype: str,
logits_soft_cap: float | None,
attn_type: str,
kv_sharing_target_layer_name: str | None,
# MLA Specific Arguments
q_lora_rank: int | None,
kv_lora_rank: int,
qk_nope_head_dim: int,
qk_rope_head_dim: int,
qk_head_dim: int,
v_head_dim: int,
kv_b_proj: ColumnParallelLinear,
indexer: object | None = None,
) -> None:
raise NotImplementedError
@abstractmethod
def forward(
self,
layer: AttentionLayer,
hidden_states_or_cq: torch.Tensor,
kv_c_normed: torch.Tensor,
k_pe: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: T,
output: torch.Tensor | None = None,
output_scale: torch.Tensor | None = None,
output_block_scale: torch.Tensor | None = None,
) -> torch.Tensor:
raise NotImplementedError
def is_quantized_kv_cache(kv_cache_dtype: str) -> bool:
return kv_cache_dtype != "auto"
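
As a rough illustration (not part of this commit), a concrete backend would subclass AttentionBackend along the following lines, assuming it lives alongside the definitions above; MyAttentionImpl and MyAttentionMetadataBuilder are hypothetical classes assumed to be defined elsewhere:

class MyAttentionBackend(AttentionBackend):
    # Class-level capability declarations consumed by the supports_* checks above.
    supported_dtypes = [torch.bfloat16]
    supported_kernel_block_sizes = [MultipleOf(16)]
    supported_kv_cache_dtypes = ["auto", "fp8"]

    @staticmethod
    def get_name() -> str:
        return "MY_ATTN"

    @staticmethod
    def get_impl_cls() -> type["AttentionImpl"]:
        return MyAttentionImpl  # hypothetical AttentionImpl subclass

    @staticmethod
    def get_builder_cls():
        return MyAttentionMetadataBuilder  # hypothetical metadata builder

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
        cache_dtype_str: str = "auto",
    ) -> tuple[int, ...]:
        # For example, a paged KV cache with stacked key/value planes.
        return (2, num_blocks, block_size, num_kv_heads, head_size)

    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        return [64, 128]

# The engine-side check then reduces to collecting human-readable reasons, e.g.:
# reasons = MyAttentionBackend.validate_configuration(
#     head_size=128, dtype=torch.bfloat16, kv_cache_dtype="auto", block_size=16,
#     use_mla=False, has_sink=False, use_sparse=False,
#     device_capability=capability, attn_type=AttentionType.DECODER,
# )
# An empty list means the backend accepts this configuration.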


@@ -0,0 +1,195 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention backend registry"""
import enum
from collections.abc import Callable
from typing import TYPE_CHECKING, cast
from vllm.logger import init_logger
from vllm.utils.import_utils import resolve_obj_by_qualname
if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionBackend
logger = init_logger(__name__)
class _AttentionBackendEnumMeta(enum.EnumMeta):
"""Metaclass for AttentionBackendEnum to provide better error messages."""
def __getitem__(cls, name: str):
"""Get backend by name with helpful error messages."""
try:
return super().__getitem__(name)
except KeyError:
members = cast("dict[str, AttentionBackendEnum]", cls.__members__).values()
valid_backends = ", ".join(m.name for m in members)
raise ValueError(
f"Unknown attention backend: '{name}'. "
f"Valid options are: {valid_backends}"
) from None
class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta):
"""Enumeration of all supported attention backends.
The enum value is the default class path, but this can be overridden
at runtime using register_backend().
To get the actual backend class (respecting overrides), use:
backend.get_class()
"""
FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
XFORMERS = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"
ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"
ROCM_AITER_FA = (
"vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
)
TORCH_SDPA = "" # this tag is only used for ViT
FLASHINFER = "vllm.v1.attention.backends.flashinfer.FlashInferBackend"
FLASHINFER_MLA = (
"vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend"
)
TRITON_MLA = "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend"
CUTLASS_MLA = "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend"
FLASHMLA = "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend"
FLASHMLA_SPARSE = (
"vllm.v1.attention.backends.mla.flashmla_sparse.FlashMLASparseBackend"
)
FLASH_ATTN_MLA = "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend"
PALLAS = "vllm.v1.attention.backends.pallas.PallasAttentionBackend"
IPEX = "vllm.v1.attention.backends.ipex.IpexAttentionBackend"
NO_ATTENTION = "vllm.v1.attention.backends.no_attention.NoAttentionBackend"
FLEX_ATTENTION = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"
TREE_ATTN = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend"
ROCM_AITER_UNIFIED_ATTN = (
"vllm.v1.attention.backends.rocm_aiter_unified_attn."
"RocmAiterUnifiedAttentionBackend"
)
CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend"
# Placeholder for third-party/custom backends - must be registered before use
CUSTOM = ""
def get_path(self, include_classname: bool = True) -> str:
"""Get the class path for this backend (respects overrides).
Returns:
The fully qualified class path string
Raises:
ValueError: If Backend.CUSTOM is used without being registered
"""
path = _OVERRIDES.get(self, self.value)
if not path:
raise ValueError(
f"Backend {self.name} must be registered before use. "
f"Use register_backend(Backend.{self.name}, 'your.module.YourClass')"
)
if not include_classname:
path = path.rsplit(".", 1)[0]
return path
def get_class(self) -> "type[AttentionBackend]":
"""Get the backend class (respects overrides).
Returns:
The backend class
Raises:
ImportError: If the backend class cannot be imported
ValueError: If Backend.CUSTOM is used without being registered
"""
return resolve_obj_by_qualname(self.get_path())
def is_overridden(self) -> bool:
"""Check if this backend has been overridden.
Returns:
True if the backend has a registered override
"""
return self in _OVERRIDES
def clear_override(self) -> None:
"""Clear any override for this backend, reverting to the default."""
_OVERRIDES.pop(self, None)
_OVERRIDES: dict[AttentionBackendEnum, str] = {}
def register_backend(
backend: AttentionBackendEnum, class_path: str | None = None
) -> Callable[[type], type]:
"""Register or override a backend implementation.
Args:
backend: The AttentionBackendEnum member to register
class_path: Optional class path. If not provided and used as
decorator, will be auto-generated from the class.
Returns:
Decorator function if class_path is None, otherwise a no-op
Examples:
# Override an existing backend
@register_backend(AttentionBackendEnum.FLASH_ATTN)
class MyCustomFlashAttn:
...
# Register a custom third-party backend
@register_backend(AttentionBackendEnum.CUSTOM)
class MyCustomBackend:
...
# Direct registration
register_backend(
AttentionBackendEnum.CUSTOM,
"my.module.MyCustomBackend"
)
"""
def decorator(cls: type) -> type:
_OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}"
return cls
if class_path is not None:
_OVERRIDES[backend] = class_path
return lambda x: x
return decorator
# Backwards compatibility alias for plugins
class _BackendMeta(type):
"""Metaclass to provide deprecation warnings when accessing _Backend."""
def __getattribute__(cls, name: str):
if name not in ("__class__", "__mro__", "__name__"):
logger.warning(
"_Backend has been renamed to AttentionBackendEnum. "
"Please update your code to use AttentionBackendEnum instead. "
"_Backend will be removed in a future release."
)
return getattr(AttentionBackendEnum, name)
def __getitem__(cls, name: str):
logger.warning(
"_Backend has been renamed to AttentionBackendEnum. "
"Please update your code to use AttentionBackendEnum instead. "
"_Backend will be removed in a future release."
)
return AttentionBackendEnum[name]
class _Backend(metaclass=_BackendMeta):
"""Deprecated: Use AttentionBackendEnum instead.
This class is provided for backwards compatibility with plugins
and will be removed in a future release.
"""
pass
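
A brief usage sketch (illustrative, not part of this commit), assuming AttentionBackendEnum, register_backend, and _Backend are in scope and "my_plugin.attention.MyCustomBackend" is a hypothetical third-party class:

# Resolve the default implementation class of a built-in backend.
flash_attn_cls = AttentionBackendEnum.FLASH_ATTN.get_class()

# get_path() returns the fully qualified class path; include_classname=False trims it
# down to the containing module path.
module_path = AttentionBackendEnum.FLASH_ATTN.get_path(include_classname=False)

# Point the CUSTOM slot at a third-party backend, then resolve it like any other.
register_backend(AttentionBackendEnum.CUSTOM, "my_plugin.attention.MyCustomBackend")
assert AttentionBackendEnum.CUSTOM.is_overridden()
custom_cls = AttentionBackendEnum.CUSTOM.get_class()

# Overrides can be dropped again; using CUSTOM without a registration raises ValueError.
AttentionBackendEnum.CUSTOM.clear_override()

# The deprecated _Backend alias still works for plugins, but logs a warning and
# forwards to AttentionBackendEnum.
legacy = _Backend["FLASH_ATTN"]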


@@ -0,0 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention backend utils"""
from dataclasses import dataclass
from vllm.config import ModelConfig
from vllm.logger import init_logger
logger = init_logger(__name__)
PAD_SLOT_ID = -1
@dataclass
class MLADims:
q_lora_rank: int | None
kv_lora_rank: int
qk_nope_head_dim: int
qk_rope_head_dim: int
v_head_dim: int
def get_mla_dims(model_config: ModelConfig) -> MLADims:
hf_text_config = model_config.hf_text_config
return MLADims(
q_lora_rank=getattr(hf_text_config, "q_lora_rank", None),
kv_lora_rank=hf_text_config.kv_lora_rank,
qk_nope_head_dim=hf_text_config.qk_nope_head_dim,
qk_rope_head_dim=hf_text_config.qk_rope_head_dim,
v_head_dim=hf_text_config.v_head_dim,
)
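
For illustration (not part of this commit), given an already-constructed ModelConfig for an MLA-style model such as a DeepSeek-V2 checkpoint, the helper reduces to a plain attribute read:

dims = get_mla_dims(model_config)  # model_config: ModelConfig, constructed elsewhere
# q_lora_rank may be None (e.g. for smaller checkpoints); the other fields are plain ints.
total_qk_head_dim = dims.qk_nope_head_dim + dims.qk_rope_head_dim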