First commit

2025-08-05 19:02:46 +08:00
parent 9efe891f99
commit 99fb9f5cb0
1412 changed files with 203615 additions and 0 deletions
--- a/pkgs/xformers/ops/fmha/common.py
+++ b/pkgs/xformers/ops/fmha/common.py
@@ -0,0 +1,542 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass
+from typing import Any, List, Mapping, Optional, Set, Tuple, Type, Union
+
+import torch
+
+from ..._cpp_lib import _built_with_cuda
+from ..common import BaseOperator
+from .attn_bias import (
+    AttentionBias,
+    BlockDiagonalMask,
+    LowerTriangularMask,
+    LowerTriangularMaskWithTensorBias,
+)
+
+
+def _is_bias_type_supported_in_BMK(attn_bias_type: Any) -> bool:
+    # NoneType
+    if isinstance(None, attn_bias_type):
+        return True
+    if attn_bias_type in [LowerTriangularMask, torch.Tensor]:
+        return True
+    return False
+
+
+@dataclass
+class Inputs:
+    """
+    Stores inputs to the `memory_efficient_attention` operators
+    """
+
+    query: torch.Tensor
+    key: torch.Tensor
+    value: torch.Tensor
+    attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None
+    p: float = 0.0
+    scale: Optional[float] = None
+    use_alibi: bool = False
+    alibi_mode: int = 1
+    imp_mode: int = 0
+
+    @property
+    def device(self) -> torch.device:
+        return self.query.device
+
+    @property
+    def scale_float(self) -> float:
+        return self.query.shape[-1] ** (-0.5) if self.scale is None else self.scale
+
+    def get_qkv_in_bmghk(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        if self.query.ndim == 5:
+            return self.query, self.key, self.value
+        if self.query.ndim == 4:
+            return (
+                self.query.unsqueeze(2),
+                self.key.unsqueeze(2),
+                self.value.unsqueeze(2),
+            )
+        if self.value.ndim == 3:
+            return (
+                self.query[:, :, None, None],
+                self.key[:, :, None, None],
+                self.value[:, :, None, None],
+            )
+        assert False
+
+    def normalize_bmhk(self) -> Tuple[int, ...]:
+        if self.query.ndim not in [3, 4, 5]:
+            raise ValueError(
+                f"Invalid shape for query: {self.query.shape}. "
+                "Expected shape [batch, seqlen, head_groups, num_heads_per_group, K]"
+                ", [batch, seqlen, num_heads, K], or [batch, seqlen, K]."
+            )
+        if self.value.dtype == torch.int32:
+            # Quantized K/V case, in which the last dims of Q and K are different.
+            # NB we currently don't have any implementations for quantized KV with
+            # SUPPORTS_DIFFERENT_VALUE_EMBED.
+            output_shape = tuple(self.query.shape)
+        else:
+            output_shape = (self.query.shape[:-1]) + (self.value.shape[-1],)
+        # Convert from legacy format
+        if self.query.ndim == 3:
+            self.query = self.query.unsqueeze(2)
+            self.key = self.key.unsqueeze(2)
+            self.value = self.value.unsqueeze(2)
+            if isinstance(self.attn_bias, torch.Tensor):
+                if self.attn_bias.ndim != 3:
+                    raise ValueError(
+                        f"Expected BMK format for attn_bias, but got {self.attn_bias.shape}"
+                    )
+                self.attn_bias = self.attn_bias.unsqueeze(1)
+        return output_shape
+
+    def validate_inputs(self) -> None:
+        qkv = (self.query, self.key, self.value)
+        if self.query.ndim not in (3, 4, 5) or any(
+            x.ndim != self.query.ndim for x in qkv
+        ):
+            raise ValueError(
+                f"Query/Key/Value should all have BMGHK, BMHK, or BMK shape.\n"
+                f"  query.shape: {self.query.shape}\n"
+                f"  key.shape  : {self.key.shape}\n"
+                f"  value.shape: {self.value.shape}"
+            )
+        if any(x.device != self.query.device for x in qkv):
+            raise ValueError("Query/Key/Value should all be on the same device")
+        quantized_dtypes = self.key.dtype == self.value.dtype == torch.int32
+        non_quantized_dtypes = all(x.dtype == self.query.dtype for x in qkv)
+        if not (quantized_dtypes or non_quantized_dtypes):
+            raise ValueError(
+                "Query/Key/Value should either all have the same dtype, or "
+                "(in the quantized case) Key/Value should have dtype torch.int32\n"
+                f"  query.dtype: {self.query.dtype}\n"
+                f"  key.dtype  : {self.key.dtype}\n"
+                f"  value.dtype: {self.value.dtype}"
+            )
+        # Biases with tensors attached are meant to be in BMHK format
+        # This would require to permute biases/gradients which can be expensive,
+        # so let's just forbid it - BMK is a legacy format anyway
+        if self.query.ndim == 3 and not _is_bias_type_supported_in_BMK(
+            type(self.attn_bias)
+        ):
+            raise ValueError(
+                f"Please provide inputs in BMHK format rather "
+                f"than BMK when using bias type `{type(self.attn_bias).__name__}`"
+            )
+        attn_bias_t: Optional[torch.Tensor] = None
+        if isinstance(self.attn_bias, torch.Tensor):
+            attn_bias_t = self.attn_bias
+        if isinstance(self.attn_bias, LowerTriangularMaskWithTensorBias):
+            attn_bias_t = self.attn_bias._bias
+        if self.query.ndim == 4 and attn_bias_t is not None:
+            expected_shape = (
+                self.query.shape[0],
+                self.query.shape[2],
+                self.query.shape[1],
+                self.key.shape[1],
+            )
+            if attn_bias_t.shape != expected_shape:
+                raise ValueError(
+                    f"Invalid shape for attention bias: {attn_bias_t.shape} (expected {expected_shape})\n"
+                    f"  query.shape: {self.query.shape}\n"
+                    f"  key.shape  : {self.key.shape}\n"
+                    f"  value.shape: {self.value.shape}"
+                )
+        if isinstance(self.attn_bias, BlockDiagonalMask):
+            if any(x.shape[0] != 1 for x in qkv):
+                raise ValueError(
+                    f"Expected batch_size=1 when using block-diagonal bias\n"
+                    f"  query.shape: {self.query.shape}\n"
+                    f"  key.shape  : {self.key.shape}\n"
+                    f"  value.shape: {self.value.shape}"
+                )
+        if self.p < 0.0 or self.p > 1.0:
+            raise ValueError(f"Invalid dropout probability: p={self.p}")
+        # Check that shapes match between inputs
+        B, Mq = self.query.shape[:2]
+        K = self.query.shape[-1]
+        B, Mkv = self.key.shape[:2]
+        Kv = self.value.shape[-1]
+
+        valid_shapes = True
+        if self.query.ndim == 3:  # BMK
+            valid_shapes = (
+                self.query.shape == (B, Mq, K)
+                and self.key.shape == (B, Mkv, K)
+                and self.value.shape == (B, Mkv, Kv)
+            )
+        H = self.query.shape[-2]
+        if self.query.ndim == 4:  # BMHK
+            quantized_kv_cache = self.value.dtype == torch.int32
+            key_embed_dim = Kv if quantized_kv_cache else K
+            valid_shapes = (
+                self.query.shape == (B, Mq, H, K)
+                and self.key.shape == (B, Mkv, H, key_embed_dim)
+                and self.value.shape == (B, Mkv, H, Kv)
+            )
+        G = self.query.shape[2]
+        if self.query.ndim == 5:  # BMNHK
+            valid_shapes = (
+                self.query.shape == (B, Mq, G, H, K)
+                and self.key.shape == (B, Mkv, G, H, K)
+                and self.value.shape == (B, Mkv, G, H, Kv)
+            )
+        if not valid_shapes:
+            raise ValueError(
+                f"Incompatible shapes for attention inputs:\n"
+                f"  query.shape: {self.query.shape}\n"
+                f"  key.shape  : {self.key.shape}\n"
+                f"  value.shape: {self.value.shape}\n"
+                "HINT: We don't support broadcasting, please use `expand` "
+                "yourself before calling `memory_efficient_attention` if you need to"
+            )
+
+
+@dataclass
+class Context:
+    lse: torch.Tensor
+    out: torch.Tensor
+    q_padded: Optional[torch.Tensor] = None
+    k_padded: Optional[torch.Tensor] = None
+    v_padded: Optional[torch.Tensor] = None
+    o_padded: Optional[torch.Tensor] = None
+    op_bw: Optional[Type["AttentionBwOpBase"]] = None
+    rng_state: Optional[torch.Tensor] = None
+
+    def get_padded_lse(self, pad_to: int, force_pad_inf: bool = False) -> torch.Tensor:
+        pad_amount = (pad_to - (self.lse.shape[2] % pad_to)) % pad_to
+        lse = self.lse
+        if pad_amount > 0:
+            if force_pad_inf:
+                lse = lse[:, :, : self.out.shape[1]]
+                pad_amount = (pad_to - (lse.shape[2] % pad_to)) % pad_to
+            lse = torch.nn.functional.pad(lse, [0, pad_amount], value=math.inf)
+        elif force_pad_inf and self.out.shape[1] != lse.shape[2]:
+            lse[:, :, self.out.shape[1] :].fill_(math.inf)
+        return lse
+
+
+@dataclass
+class Gradients:
+    dq: torch.Tensor
+    dk: torch.Tensor
+    dv: torch.Tensor
+    # bias gradient. None if there is no tensor bias or if it doesn't require grad
+    db: Optional[torch.Tensor] = None
+
+
+class AttentionOpBase(BaseOperator):
+    """Base class for any attention operator in xFormers
+
+    See:
+
+    - :attr:`xformers.ops.fmha.cutlass.FwOp`
+    - :attr:`xformers.ops.fmha.cutlass.BwOp`
+    - :attr:`xformers.ops.fmha.flash.FwOp`
+    - :attr:`xformers.ops.fmha.flash.BwOp`
+    - :attr:`xformers.ops.fmha.triton.FwOp`
+    - :attr:`xformers.ops.fmha.triton.BwOp`
+    - :attr:`xformers.ops.fmha.small_k.FwOp`
+    - :attr:`xformers.ops.fmha.small_k.BwOp`
+    """
+
+    OPERATOR: Any
+    SUPPORTED_DEVICES: Set[str]
+    CUDA_MINIMUM_COMPUTE_CAPABILITY: Tuple[int, int] = (5, 0)
+    SUPPORTED_DTYPES: Set[torch.dtype]
+    SUPPORTED_MAX_K: float
+    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {type(None)}
+    SUPPORTS_DROPOUT: bool
+    SUPPORTS_CUSTOM_SCALE: bool = False
+    SUPPORTS_DIFFERENT_VALUE_EMBED: bool = False
+    IS_DETERMINISTIC: bool = True
+    SUPPORTS_BMGHK: bool = False
+    NAME: str
+    OPERATOR_CATEGORY = "memory_efficient_attention"
+
+    _TEST_BATCH_SIZES: List[int] = [1, 300]
+    _TEST_K: List[int] = [32, 128]
+
+    @classmethod
+    def supports(cls, d: Inputs) -> bool:
+        return not cls.not_supported_reasons(d)
+
+    @classmethod
+    def shape_not_supported_reasons(
+        cls, Mq: int, Mkv: int, K: int, Kv: int
+    ) -> List[str]:
+        reasons = []
+        if not cls.SUPPORTS_DIFFERENT_VALUE_EMBED and K != Kv:
+            reasons.append("query.shape[-1] != value.shape[-1]")
+        if max(K, Kv) > cls.SUPPORTED_MAX_K:
+            reasons.append(
+                f"max(query.shape[-1] != value.shape[-1]) > {cls.SUPPORTED_MAX_K}"
+            )
+        return reasons
+
+    @classmethod
+    def not_supported_reasons(cls, d: Inputs) -> List[str]:
+        """
+        Returns a list of reasons why this is not supported.
+        The kernel can run these inputs only if the returned list is empty
+        """
+        reasons = cls.shape_not_supported_reasons(
+            Mq=d.query.shape[1],
+            Mkv=d.key.shape[1],
+            K=d.query.shape[-1],
+            Kv=d.query.shape[-1],
+        )
+        device_type = d.query.device.type
+        dtype = d.query.dtype
+        if device_type not in cls.SUPPORTED_DEVICES:
+            reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})")
+        if device_type == "cuda" and not _built_with_cuda:
+            reasons.append("xFormers wasn't build with CUDA support")
+        if device_type == "cuda":
+            device_capability = torch.cuda.get_device_capability(d.device)
+            if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
+                reasons.append(
+                    f"requires device with capability > {cls.CUDA_MINIMUM_COMPUTE_CAPABILITY} "
+                    f"but your GPU has capability {device_capability} (too old)"
+                )
+        if dtype not in cls.SUPPORTED_DTYPES:
+            reasons.append(f"dtype={dtype} (supported: {cls.SUPPORTED_DTYPES})")
+        if type(d.attn_bias) not in cls.SUPPORTED_ATTN_BIAS_TYPES:
+            reasons.append(f"attn_bias type is {type(d.attn_bias)}")
+        if (d.p != 0.0) and not cls.SUPPORTS_DROPOUT:
+            reasons.append("dropout > 0.0")
+        if d.scale is not None and not cls.SUPPORTS_CUSTOM_SCALE:
+            reasons.append("has custom scale")
+        # bfloat16 is only supported on A100+
+        # ... although the kernels can still run and give the
+        # correct result
+        if dtype is torch.bfloat16 and (
+            not device_type.startswith("cuda")
+        ):
+            reasons.append("bf16 is only supported on A100+ GPUs")
+        if not cls.is_available():
+            reasons.append(
+                "operator wasn't built - see `python -m xformers.info` for more info"
+            )
+        if not cls.IS_DETERMINISTIC and torch.are_deterministic_algorithms_enabled():
+            reasons.append(
+                "operator is non-deterministic, but `torch.use_deterministic_algorithms` is set"
+            )
+        if not cls.SUPPORTS_BMGHK and d.query.ndim == 5:
+            reasons.append("operator does not support BMGHK format")
+        return reasons
+
+
+class AttentionFwOpBase(AttentionOpBase):
+    ERROR_ATOL: Mapping[torch.dtype, float] = {
+        torch.float: 3e-4,
+        torch.half: 4e-3,
+        torch.bfloat16: 2e-2,
+    }
+    ERROR_RTOL: Mapping[torch.dtype, float] = {
+        torch.float: 2e-5,
+        torch.half: 4e-4,
+        torch.bfloat16: 5e-3,
+    }
+
+    @classmethod
+    def apply(
+        cls, inp: Inputs, needs_gradient: bool
+    ) -> Tuple[torch.Tensor, Optional[Context]]:
+        raise NotImplementedError()
+
+    @classmethod
+    def attn_operator_flop(
+        cls,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        causal: bool = False,
+        seqstart_k: Optional[torch.Tensor] = None,
+        seqstart_q: Optional[torch.Tensor] = None,
+    ) -> int:
+        """
+        Computes total flops for the attention
+        Assumes inputs in format BMHK
+        """
+        assert query.ndim == 4
+
+        if seqstart_q is not None:
+            seqstart_q_py = seqstart_q.tolist()
+        else:
+            seqstart_q_py = [0, query.shape[1]]
+        if seqstart_k is not None:
+            seqstart_k_py = seqstart_k.tolist()
+        else:
+            seqstart_k_py = [0, key.shape[1]]
+
+        total_flop = 0
+        for q_start, q_end, k_start, k_end in zip(
+            seqstart_q_py, seqstart_q_py[1:], seqstart_k_py, seqstart_k_py[1:]
+        ):
+            num_q = q_end - q_start
+            num_kv = k_end - k_start
+            # (M,K) @ (K,N) GEMM needs M*N*K*2 flop
+            # Q @ K.transpose
+            total_flop += num_q * num_kv * query.shape[-1] * 2
+            # (ignore softmax)
+            # attn @ V
+            total_flop += num_q * key.shape[-1] * num_kv * 2
+        # Multiply by num_heads and batches
+        total_flop = total_flop * value.shape[2] * value.shape[0]
+        if causal:
+            total_flop //= 2
+        return total_flop
+
+
+class AttentionBwOpBase(AttentionOpBase):
+    ERROR_ATOL: Mapping[torch.dtype, float] = {
+        torch.float: 5e-4,
+        torch.half: 9e-2,
+        torch.bfloat16: 0.7,
+    }
+    ERROR_RTOL: Mapping[torch.dtype, float] = {
+        torch.float: 1e-4,
+        torch.half: 2e-2,
+        torch.bfloat16: 0.1,
+    }
+    SUPPORTS_ATTN_BIAS_GRAD = False
+
+    @classmethod
+    def not_supported_reasons(cls, d: Inputs) -> List[str]:
+        reasons = super(AttentionBwOpBase, cls).not_supported_reasons(d)
+        if (
+            isinstance(d.attn_bias, torch.Tensor)
+            and d.attn_bias.requires_grad
+            and not cls.SUPPORTS_ATTN_BIAS_GRAD
+        ):
+            reasons.append(
+                "Computing the bias gradient is not supported (attn_bias.requires_grad = True)"
+            )
+
+        return reasons
+
+    @classmethod
+    def apply(cls, ctx: Context, inp: Inputs, grad: torch.Tensor) -> Gradients:
+        raise NotImplementedError()
+
+    @classmethod
+    def attn_operator_flop(
+        cls,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        causal: bool = False,
+        seqstart_k: Optional[torch.Tensor] = None,
+        seqstart_q: Optional[torch.Tensor] = None,
+    ) -> int:
+        """
+        Computes total flops for the attention
+        Assumes inputs in format BMHK
+        """
+        assert query.ndim == 4
+
+        if seqstart_q is not None:
+            seqstart_q_py = seqstart_q.tolist()
+        else:
+            seqstart_q_py = [0, query.shape[1]]
+        if seqstart_k is not None:
+            seqstart_k_py = seqstart_k.tolist()
+        else:
+            seqstart_k_py = [0, key.shape[1]]
+
+        total_flop = 0
+        for q_start, q_end, k_start, k_end in zip(
+            seqstart_q_py, seqstart_q_py[1:], seqstart_k_py, seqstart_k_py[1:]
+        ):
+            num_q = q_end - q_start
+            num_kv = k_end - k_start
+            Kqk = query.shape[-1]
+            Kv = value.shape[-1]
+            # (M,K) @ (K,N) GEMM needs M*N*K*2 flop
+            # att = Q @ K.transpose
+            total_flop += num_q * num_kv * Kqk * 2
+            # att @ dO
+            total_flop += num_kv * num_q * Kv * 2
+            # dov = dO @ V
+            total_flop += num_q * Kv * num_kv * 2
+            # dov @ K
+            total_flop += num_q * Kqk * num_kv * 2
+            # dov @ Q
+            total_flop += num_q * Kqk * num_kv * 2
+        # Multiply by num_heads and batches
+        total_flop = total_flop * value.shape[2] * value.shape[0]
+        if causal:
+            total_flop //= 2
+        return total_flop
+
+
+AttentionOp = Tuple[
+    Optional[Type[AttentionFwOpBase]], Optional[Type[AttentionBwOpBase]]
+]
+
+
+@dataclass
+class AttentionOpDispatch:
+    """Dispatcher to automatically select
+    the best operator to run memory-efficient attention.
+
+    :Deprecated:
+
+        This class is deprecated and will be removed in a later version
+    """
+
+    op: AttentionOp
+
+    @classmethod
+    def from_arguments(
+        cls,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_bias: Optional[Union[torch.Tensor, AttentionBias]] = None,
+        p: float = 0.0,
+        scale: Optional[float] = None,
+    ) -> "AttentionOpDispatch":
+        """Here for backward compatibility"""
+        from .dispatch import _dispatch_bw, _dispatch_fw
+
+        inp = Inputs(
+            query=query,
+            key=key,
+            value=value,
+            attn_bias=attn_bias,
+            p=p,
+            scale=scale,
+        )
+        return AttentionOpDispatch(op=(_dispatch_fw(inp, True), _dispatch_bw(inp)))
+
+
+def bmk2bmhk(tensor, num_heads: int) -> torch.Tensor:
+    if tensor.ndim == 4:
+        return tensor
+    return tensor.reshape([-1, num_heads, tensor.shape[1], tensor.shape[2]]).permute(
+        (0, 2, 1, 3)
+    )
+
+
+def check_lastdim_alignment_stride1(
+    reasons: List[str], name: str, x: torch.Tensor, alignment: int
+) -> None:
+    if x.shape[-1] % alignment != 0:
+        reasons.append(f"{name}.shape[-1] % {alignment} != 0")
+    elif x.stride(-2) % alignment != 0:
+        reasons.append(
+            f"{name}.stride(-2) % {alignment} != 0 ({name}.stride() = {x.stride()})"
+        )
+    # We can have stride=0 sometimes if dimension=1
+    if x.stride(-1) > 1:
+        reasons.append(
+            f"{name}.stride(-1) > 1 ({name}.stride() = {x.stride()}) - you should call `.contiguous()` on the input"
+        )