First commit

2025-08-05 19:02:46 +08:00
parent 9efe891f99
commit 99fb9f5cb0
1412 changed files with 203615 additions and 0 deletions
--- a/pkgs/xformers/ops/fmha/triton_splitk.py
+++ b/pkgs/xformers/ops/fmha/triton_splitk.py
@@ -0,0 +1,738 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple
+
+import torch
+
+from ..common import _has_triton21, register_operator
+from .attn_bias import BlockDiagonalCausalWithOffsetPaddedKeysMask
+from .common import AttentionFwOpBase, Context, Inputs, check_lastdim_alignment_stride1
+
+
+def _strides(x: torch.Tensor, *stride_names: str):
+    assert x.ndim == len(stride_names)
+    return {f"stride_{s}": x.stride(i) for i, s in enumerate(stride_names)}
+
+
+if TYPE_CHECKING or _has_triton21():
+    import triton
+    import triton.language as tl
+
+    from xformers.triton.vararg_kernel import VAR_ARGS_ARRAY, unroll_varargs
+
+    @triton.jit
+    def _fwd_kernel_splitK(
+        Q,
+        K,
+        V,
+        sm_scale,
+        Out_splitK,  # [B, H, split_k, Mq, K]
+        Metadata,  # [B, H, 2, split_k, M_ceil] contains [mi, li]
+        Seq_len,
+        stride_qz,
+        stride_qm,
+        stride_qg,
+        stride_qh,
+        stride_qk,
+        stride_kz,
+        stride_kn,
+        stride_kg,
+        stride_kh,
+        stride_kk,
+        stride_vz,
+        stride_vn,
+        stride_vg,
+        stride_vh,
+        stride_vk,
+        stride_osk_zhg,
+        stride_osk_s,
+        stride_osk_m,
+        stride_osk_k,
+        stride_mzhg,
+        stride_m2,
+        stride_ms,
+        stride_mm,
+        Z,
+        N_CTX_Q,
+        N_CTX_K,
+        BLOCK_N_PER_SPLIT,
+        H: tl.constexpr,
+        G: tl.constexpr,
+        BLOCK_M: tl.constexpr,
+        BLOCK_DMODEL: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+        BOUNDS_CHECKS_N: tl.constexpr,
+        USE_SEQ_LEN: tl.constexpr,
+        PACKED_PER_VAL: tl.constexpr = 1,
+        N_GROUPS: tl.constexpr = 1,
+    ):
+        """This kernel can accept non-quantized or int4-quantized keys/values.
+        PACKED_PER_VAL determines the quantization type:
+            - PACKED_PER_VAL == 1 means no quantization
+            - PACKED_PER_VAL == 8 means 4-bit quantization (8 packed quantized values inside one int32)
+        For the quantized case K/V should be int32 tensors.
+        Quantization can be row-wise (when N_GROUPS = 1) or group-wise with N_GROUPS = 2, 4, or 8.
+        Quantization coefficients are stored at the beginning of the row along the last dimension of K/V
+        So K[B, H, M, :] has a form
+        [   quant_coef0, quant_coef1, ...|
+            group0_quant_value0, group0_quant_value1,... |
+            group1_quant_value0, group1_quant_value1,...]
+        where each quant_coef is an int32 which should be interpreted as 2 packed float16: scale and offset.
+
+        Note: this kernel needs to be processed by xformers.triton.vararg_kernel.unroll_varargs
+        before compilation. That will unroll variables marked with "VAR_ARGS_ARRAY" into lists.
+        See how FwOp.apply does it below.
+        """
+        tl.static_assert(
+            (PACKED_PER_VAL == 1 and tl.constexpr(K.dtype.element_ty != tl.int32))
+            or (PACKED_PER_VAL == 8 and tl.constexpr(K.dtype.element_ty == tl.int32)),
+            f"Only 4-bit quantization is supported, K/V should have dtype int32 in "
+            f"the quantized case: {PACKED_PER_VAL=} {tl.constexpr(K.dtype)=} {tl.constexpr(K.dtype.element_ty)=}",
+        )
+        tl.static_assert(
+            (((N_GROUPS == 1 or N_GROUPS == 2) or N_GROUPS == 4) or N_GROUPS == 8),
+            "Number of quantization groups can be 1 (row-wise quantization), 2, 4, or 8.",
+        )
+
+        QUANTIZED: tl.constexpr = PACKED_PER_VAL > 1
+        PACKED_D_PER_GROUP: tl.constexpr = BLOCK_DMODEL // PACKED_PER_VAL // N_GROUPS
+        D_PER_GROUP: tl.constexpr = BLOCK_DMODEL // N_GROUPS
+
+        start_m = tl.program_id(0)
+        off_zhg = tl.program_id(1)
+        off_z = off_zhg // (H * G)
+        off_h = (off_zhg // G) % H
+        off_g = off_zhg % G
+        splitk_idx = tl.program_id(2)
+
+        lo = splitk_idx * BLOCK_N_PER_SPLIT
+        if USE_SEQ_LEN:
+            kv_len = tl.load(Seq_len + off_z)
+        else:
+            kv_len = N_CTX_K
+        hi = tl.minimum((splitk_idx + 1) * BLOCK_N_PER_SPLIT, kv_len)
+
+        Q_block_ptr = tl.make_block_ptr(
+            base=Q + off_h * stride_qh + off_z * stride_qz + off_g * stride_qg,
+            shape=(N_CTX_Q, D_PER_GROUP),
+            strides=(stride_qm, stride_qk),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, D_PER_GROUP),
+            order=(1, 0),
+        )
+
+        k_base = K + off_h * stride_kh + off_z * stride_kz + off_g * stride_kg
+        # Additional shift by 1 along the last dimension in the quantized case, since
+        # the first element along that dim contains packed quantization coefficients.
+        K_block_ptr = tl.make_block_ptr(
+            base=k_base + stride_kk * QUANTIZED * N_GROUPS,
+            shape=(PACKED_D_PER_GROUP, hi),
+            strides=(stride_kk, stride_kn),
+            offsets=(0, lo),
+            block_shape=(PACKED_D_PER_GROUP, BLOCK_N),
+            order=(0, 1),
+        )
+        v_base = V + off_h * stride_vh + off_z * stride_vz + off_g * stride_vg
+        V_block_ptr = tl.make_block_ptr(
+            base=v_base + stride_vk * QUANTIZED * N_GROUPS,
+            shape=(hi, PACKED_D_PER_GROUP),
+            strides=(stride_vn, stride_vk),
+            offsets=(lo, 0),
+            block_shape=(BLOCK_N, PACKED_D_PER_GROUP),
+            order=(1, 0),
+        )
+
+        if QUANTIZED:
+            # Pointers to quantization coefficients. Even those they are 1D,
+            # we have to use block pointers, since usual pointers
+            # don't support boundary checks
+            K_scale_shift_block_ptr = tl.make_block_ptr(
+                base=k_base,
+                shape=(1, hi),
+                strides=(stride_kk, stride_kn),
+                offsets=(0, lo),
+                block_shape=(1, BLOCK_N),
+                order=(0, 1),
+            )
+            V_scale_shift_block_ptr = tl.make_block_ptr(
+                base=v_base,
+                shape=(hi, 1),
+                strides=(stride_vn, stride_vk),
+                offsets=(lo, 0),
+                block_shape=(BLOCK_N, 1),
+                order=(1, 0),
+            )
+        else:
+            K_scale_shift_block_ptr = None
+            V_scale_shift_block_ptr = None
+
+        # initialize pointer to m and l
+        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+
+        # Before compilation, this kernel will be processed by xformers.triton.vararg_kernel.unroll_varargs.
+        # That turns tensors annotated as the one below into lists of tensors of length N_GROUPS.
+        # This is a solution for Triton native lack of support for lists of tensors.
+        acc: "VAR_ARGS_ARRAY"  # noqa: F821
+
+        for i in range(len(acc)):  # noqa: F821
+            acc[i] = tl.zeros([BLOCK_M, D_PER_GROUP], dtype=tl.float32)  # noqa: F821
+        # scale sm_scale by log_2(e) and use
+        # 2^x instead of exp in the loop because CSE and LICM
+        # don't work as expected with `exp` in the loop
+        qk_scale = sm_scale * 1.44269504
+        # load q: it will stay in SRAM throughout
+        q: "VAR_ARGS_ARRAY"  # noqa: F821
+        for i in range(len(acc)):  # noqa: F821
+            q[i] = tl.load(  # noqa: F821
+                tl.advance(Q_block_ptr, (0, i * D_PER_GROUP)), boundary_check=(0,)
+            )
+        # loop over k, v and update accumulator
+        for start_n in range(lo, hi, BLOCK_N):
+            k: "VAR_ARGS_ARRAY"  # noqa: F821
+            v: "VAR_ARGS_ARRAY"  # noqa: F821
+            for i in range(len(acc)):  # noqa: F821
+                k[i], v[i] = load_dequantize_k_v_group(  # noqa: F821
+                    K_block_ptr,
+                    V_block_ptr,
+                    K_scale_shift_block_ptr,
+                    V_scale_shift_block_ptr,
+                    BOUNDS_CHECKS_N,
+                    PACKED_PER_VAL,
+                    PACKED_D_PER_GROUP,
+                    Q.dtype.element_ty,
+                    i,
+                )
+
+            # -- compute qk ---
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            for i in range(len(acc)):  # noqa: F821
+                qk += tl.dot(q[i], k[i])  # noqa: F821
+            qk *= qk_scale
+
+            # TODO: This is slow, and only needed at the last iteration.
+            # Maybe we can unroll the last iteration instead?
+            if BOUNDS_CHECKS_N:
+                qk = tl.where(tl.arange(0, BLOCK_N) < hi - start_n, qk, float("-inf"))
+            # -- compute scaling constant ---
+            m_i_new = tl.maximum(m_i, tl.max(qk, 1))
+            alpha = tl.math.exp2(m_i - m_i_new)
+            p = tl.math.exp2(qk - m_i_new[:, None])
+
+            # -- update m_i and l_i --
+            l_i = l_i * alpha + tl.sum(p, 1)
+            m_i = m_i_new
+            p = p.to(Q.dtype.element_ty)
+
+            # -- scale and update acc --
+            for i in range(len(acc)):  # noqa: F821
+                acc[i] *= alpha[:, None]  # noqa: F821
+                acc[i] += tl.dot(p, v[i])  # noqa: F821
+            # update pointers
+            K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+            V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+            if PACKED_PER_VAL > 1:
+                K_scale_shift_block_ptr = tl.advance(
+                    K_scale_shift_block_ptr, (0, BLOCK_N)
+                )
+                V_scale_shift_block_ptr = tl.advance(
+                    V_scale_shift_block_ptr, (BLOCK_N, 0)
+                )
+
+        # write back O
+        O_block_ptr = tl.make_block_ptr(
+            base=Out_splitK + off_zhg * stride_osk_zhg + splitk_idx * stride_osk_s,
+            shape=(N_CTX_Q, D_PER_GROUP),
+            strides=(stride_osk_m, 1),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, D_PER_GROUP),
+            order=(1, 0),
+        )
+        for i in range(len(acc)):  # noqa: F821
+            tl.store(
+                tl.advance(O_block_ptr, (0, i * D_PER_GROUP)),
+                acc[i],  # noqa: F821
+                boundary_check=(0,),
+            )
+        # Write metadata for split-K reduction
+        Metadata_ptr = (
+            Metadata
+            + off_zhg * stride_mzhg
+            + splitk_idx * stride_ms
+            + start_m * BLOCK_M
+            + tl.arange(0, BLOCK_M)
+        )
+        tl.store(Metadata_ptr, m_i)
+        tl.store(Metadata_ptr + stride_m2, l_i)
+
+    @triton.jit
+    def load_dequantize_k_v_group(
+        K_block_ptr,
+        V_block_ptr,
+        K_scale_shift_block_ptr,
+        V_scale_shift_block_ptr,
+        BOUNDS_CHECKS_N: tl.constexpr,
+        PACKED_PER_VAL: tl.constexpr,
+        PACKED_D_PER_GROUP: tl.constexpr,
+        dtype: tl.constexpr,
+        group_id: tl.constexpr,
+    ):
+        """Load K/V for a given block. In case of int4-quantized K/V, dequantize them after loading.
+        If quantization is group-wise, use group_id to advance the pointers to the current group.
+        """
+        # Advance to the current quantization group
+        K_block_ptr = tl.advance(K_block_ptr, (PACKED_D_PER_GROUP * group_id, 0))
+        V_block_ptr = tl.advance(V_block_ptr, (0, PACKED_D_PER_GROUP * group_id))
+
+        # -- load k, v --
+        k = tl.load(K_block_ptr, boundary_check=(1,) if BOUNDS_CHECKS_N else ())
+        v = tl.load(V_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_N else ())
+
+        if PACKED_PER_VAL > 1:
+            # K/V are quantized, load quantization coefficients and dequantize
+
+            K_scale_shift_block_ptr = tl.advance(K_scale_shift_block_ptr, (group_id, 0))
+            V_scale_shift_block_ptr = tl.advance(V_scale_shift_block_ptr, (0, group_id))
+
+            k_scale_shift = tl.load(
+                K_scale_shift_block_ptr, boundary_check=(1,) if BOUNDS_CHECKS_N else ()
+            )
+            v_scale_shift = tl.load(
+                V_scale_shift_block_ptr, boundary_check=(0,) if BOUNDS_CHECKS_N else ()
+            )
+
+            k_scale, k_shift = cast_uint32_to_half2(k_scale_shift)
+            v_scale, v_shift = cast_uint32_to_half2(v_scale_shift)
+            v = dequantize(v, v_scale, v_shift, PACKED_PER_VAL).to(dtype)
+            k_t = dequantize(
+                tl.trans(k),
+                tl.trans(k_scale),
+                tl.trans(k_shift),
+                PACKED_PER_VAL,
+            ).to(dtype)
+            k = tl.trans(k_t)
+        return k, v
+
+    @triton.jit
+    def cast_uint32_to_half2(scale_shift):
+        """Extract two float16 packed into one int32"""
+        scale = scale_shift & 0xFFFF
+        shift = scale_shift >> 16
+        scale = scale.to(tl.uint16).to(tl.float16, bitcast=True)
+        shift = shift.to(tl.uint16).to(tl.float16, bitcast=True)
+        return scale, shift
+
+    @triton.jit
+    def dequantize(
+        x_,
+        scale,
+        shift,
+        PACKED_PER_VAL: tl.constexpr = 8,
+    ):
+        """PACKED_PER_VAL is the number of values packed into each element x_.
+        For example, for int4 quantization and x_ of type int32, PACKED_PER_VAL is 8.
+        """
+
+        # Axis along which offsets are applied matters here
+        # It would be natural to have offsets in shape (BLOCK_N, D // PACKED_PER_VAL, PACKED_PER_VAL)
+        # and expand K/V to that shape before applying offsets
+        # However, Triton for some reason considers dim=1 as contiguous when doing tl.view below, and not dim=2
+        # Note that tl.view doesn't guarantee the order of elements in the result - thus the code below depends
+        # on the implementation details which might change in the future.
+        # Ideally we would like to use tl.reshape, but it's not implemented yet.
+        # See https://github.com/openai/triton/blob/9055af1a5dadc576804b38dd77ee91dc42af0bf7/python/triton/language/semantic.py#L541 # noqa: E501
+
+        # x_ : (BLOCK_N, D // PACKED_PER_VAL)
+        # scale: (BLOCK_N, 1)
+        # offsets: (PACKED_PER_VAL,)
+        BLOCK_N: tl.constexpr = x_.shape[0]
+        BLOCK_DMODEL_PACKED: tl.constexpr = x_.shape[1]
+        offsets = tl.arange(0, PACKED_PER_VAL) * 4
+        quant_offset = (
+            x_[:, None, :] >> offsets[None, :, None]
+        )  # (BLOCK_N, PACKED_PER_VAL, D // PACKED_PER_VAL)
+
+        quant_offset = tl.view(
+            quant_offset, (BLOCK_N, BLOCK_DMODEL_PACKED * PACKED_PER_VAL)
+        )
+        # Trick - instead of converting int4 to float16 we view it as float16
+        # and then multiply by 32768 * 512 == 2**24
+        quant_offset = (quant_offset & 0xF).to(tl.uint16).to(tl.float16, bitcast=True)
+        quant_offset = (quant_offset * 32768.0).to(tl.float16)
+        scale_512 = scale * 512
+
+        dequant = quant_offset * scale_512 + shift
+        return dequant
+
+    @triton.jit
+    def _splitK_reduce(
+        Out_splitK,  # [B, H, split_k, Mq, K]
+        Metadata,  # [B, H, 2, split_k, M_ceil] contains [mi, li]
+        Out,  # [B, H, M, K]
+        LSE,  # [B, H, M]
+        split_k,
+        stride_osk_zhg,
+        stride_osk_s,
+        stride_osk_m,
+        stride_osk_k,
+        stride_mzhg,
+        stride_m2,
+        stride_ms,
+        stride_mm,
+        stride_oz,
+        stride_oh,
+        stride_og,
+        stride_om,
+        stride_ok,
+        stride_lse_zhg,
+        stride_lse_m,
+        BLOCK_SIZE: tl.constexpr,
+        H: tl.constexpr,
+        G: tl.constexpr,
+    ):
+        off_zhg = tl.program_id(0)
+        off_z = off_zhg // (H * G)
+        off_h = (off_zhg // G) % H
+        off_g = off_zhg % G
+        off_m = tl.program_id(1)
+
+        Out_splitK_ptr = (
+            Out_splitK
+            + stride_osk_zhg * off_zhg
+            + stride_osk_m * off_m
+            + tl.arange(0, BLOCK_SIZE)
+        )
+        Metadata_ptr = Metadata + stride_mzhg * off_zhg + off_m
+        m = tl.load(Metadata_ptr)
+        l_sum = tl.load(Metadata_ptr + stride_m2)
+        acc = tl.load(Out_splitK_ptr)
+
+        for split_k_idx in range(1, split_k):
+            Metadata_ptr = Metadata_ptr + stride_ms
+            Out_splitK_ptr = Out_splitK_ptr + stride_osk_s
+
+            m_k = tl.load(Metadata_ptr)
+            l_k = tl.load(Metadata_ptr + stride_m2)
+            acc_k = tl.load(Out_splitK_ptr)
+
+            m_new = tl.maximum(m, m_k)
+            if m_k < m:
+                # Scale incoming values
+                alpha = tl.math.exp2(m_k - m_new)
+                acc_k = acc_k * alpha
+                l_k = l_k * alpha
+            else:
+                # Scale our values
+                alpha = tl.math.exp2(m - m_new)
+                acc = acc * alpha
+                l_sum = l_sum * alpha
+
+            m = m_new
+            l_sum = l_sum + l_k
+            acc = acc + acc_k
+
+        acc = acc / l_sum
+        Out_ptr = (
+            Out
+            + stride_oz * off_z
+            + stride_oh * off_h
+            + stride_og * off_g
+            + stride_om * off_m
+            + tl.arange(0, BLOCK_SIZE)
+        )
+        tl.store(Out_ptr, acc)
+
+        l_ptrs = LSE + off_zhg * stride_lse_zhg + off_m
+        tl.store(l_ptrs, (m + tl.math.log2(l_sum)) / 1.44269504)
+
+else:
+    _fwd_kernel_splitK = None
+    _splitK_reduce = None
+
+
+@register_operator
+class FwOp(AttentionFwOpBase):
+    """Flash-Attention with Split-K. Supports fused int-4 K/V quantization.
+    Quantized path will be taken if input K/V have type int32.
+
+    Quantization can be row-wise or group-wise (when cls.NUM_GROUPS > 1) along
+    the last dimension of K and V. Currently 1, 2, 4, or 8 groups per row are supported.
+    Quantization coefficients (scale and shift) are represented as two
+    float16 constants per group, packed into int32. Quantization coefficients of
+    all groups are placed at the beginning of the row. So, if unquantized K/V have head
+    dimension D, the quantized versions have head dimension D // 8 + NUM_GROUPS
+    and dtype int32.
+    Pseudocode for dequantizing one row can look like:
+    group_size = D // 8
+    for i in range(NUM_GROUPS):
+        group_start = NUM_GROUPS + i * group_size
+        group_quant = K[..., group_start: group_start + group_size]
+        scale, shift = unpack_int32_into_float16x2(group_quant[0])
+        group_dequant = group_quant[..., 1:] * scale + shift
+    ...
+
+    """
+
+    OPERATOR = _fwd_kernel_splitK
+    SUPPORTED_DEVICES = {"cuda"}
+    CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
+    SUPPORTED_DTYPES = {
+        torch.half,
+        torch.bfloat16,
+    }  # Those are dtypes of Q. In the quantized case K/V has dtype int32
+    SUPPORTED_MAX_K = 128
+    SUPPORTED_ATTN_BIAS_TYPES: Set[Any] = {
+        type(None),
+        BlockDiagonalCausalWithOffsetPaddedKeysMask,
+    }
+    SUPPORTS_DROPOUT = False
+    SUPPORTS_CUSTOM_SCALE = True
+    SUPPORTS_BMGHK = True
+    NAME = "triton_splitKF"
+
+    SPLIT_K: Optional[int] = None
+    BLOCK_M = 16
+    BLOCK_N = 64
+
+    NUM_GROUPS = 1  # Default quantization is row-wise
+
+    @classmethod
+    def shape_not_supported_reasons(
+        cls, Mq: int, Mkv: int, K: int, Kv: int
+    ) -> List[str]:
+        reasons = super().shape_not_supported_reasons(Mq, Mkv, K, Kv)
+        if K not in {16, 32, 64, 128}:
+            reasons.append(f"Embed dim {K} not supported")
+        return reasons
+
+    @classmethod
+    def not_supported_reasons(cls, d: Inputs) -> List[str]:
+        reasons = super(FwOp, cls).not_supported_reasons(d)
+        check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
+        if d.key.dtype != torch.int32:
+            check_lastdim_alignment_stride1(reasons, "key", d.key, 8)
+            check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
+        if cls.OPERATOR is None:
+            reasons.append("triton is not available")
+        if d.device.type == "cuda":
+            # Has only been tested on 8.0 / 9.0.
+            if torch.cuda.get_device_capability(d.device) < (8, 0):
+                reasons.append(
+                    "requires GPU with sm80 minimum compute capacity, e.g., A100/H100/L4"
+                )
+
+        q_len = d.query.shape[1]
+        if isinstance(d.attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask):
+            seqinfo = d.attn_bias.q_seqinfo
+            if q_len != seqinfo.seqstart_py[-1]:
+                reasons.append(
+                    f"Expected total {seqinfo.seqstart_py[-1]} queries not {q_len}"
+                )
+            q_len = seqinfo.min_seqlen
+            if q_len != seqinfo.max_seqlen:
+                reasons.append(
+                    "Variable query len is not supported in the presence of causal mask."
+                )
+
+        if d.key.ndim in [4, 5] and d.key.shape[-2] != 1:
+            if d.key.stride(-2) == 0 and d.value.stride(-2) == 0 and q_len > 1:
+                reasons.append("multiquery is only supported with query seqlen=1")
+
+        if d.attn_bias is not None and q_len > 1:
+            reasons.append(
+                "query with seqlen > 1 is not supported in the presence of causal mask"
+            )
+        return reasons
+
+    @classmethod
+    def get_split_k(cls, B: int, H: int, Mk: int) -> int:
+        """Heuristic for the number of splits"""
+        bh = B * H
+        split_k = max(Mk, 1024) // bh
+        max_chunk_size = 64 if Mk <= 512 and bh <= 64 else 128
+        while split_k > 0 and Mk / split_k < max_chunk_size:
+            split_k = split_k // 2
+        split_k = min(split_k, 64)
+        split_k = max(split_k, 1)
+        return split_k
+
+    @classmethod
+    def apply(
+        cls, inp: Inputs, needs_gradient: bool
+    ) -> Tuple[torch.Tensor, Optional[Context]]:
+        attn_bias = inp.attn_bias
+        seq_len = None
+        q, k, v = inp.get_qkv_in_bmghk()
+
+        if attn_bias is not None:
+            assert isinstance(attn_bias, BlockDiagonalCausalWithOffsetPaddedKeysMask)
+            # TODO: do we really need to do this cast? seems fishy but
+            # I just copied it from the decoder.py
+            attn_bias.k_seqinfo.to(inp.query.device)
+            attn_bias.q_seqinfo.to(inp.query.device)
+            seq_len = attn_bias.k_seqinfo.seqlen
+            B = len(seq_len)
+            G, H, Kq = q.shape[-3:]
+            Kkv = v.shape[-1]
+
+            # assume kv has been padded
+            q = q.reshape(B, -1, G, H, Kq)
+            k = k.reshape(B, -1, G, H, Kkv)
+            v = v.reshape(B, -1, G, H, Kkv)
+
+        # Transpose in the case of MQA/GQA
+        mqa_swap_seqlen_head = False
+        if k.shape[3] > 1 and k.stride(3) == 0 and v.stride(3) == 0:
+            mqa_swap_seqlen_head = True
+            assert q.shape[1] == 1
+            q = q.transpose(1, 3)
+            k = k[:, :, :, :1]
+            v = v[:, :, :, :1]
+
+        if k.dtype == torch.int32:
+            # Quantized K/V
+            PACKED_PER_VAL = 8
+            Lk = (k.shape[-1] - cls.NUM_GROUPS) * 8
+        else:
+            Lk = k.shape[-1]
+            PACKED_PER_VAL = 1
+
+        B, Mk, G, H, Kkv = k.shape
+        B, M, G, H, Kq = q.shape
+        assert Lk == Kq, f"Keys have head dim {Lk} but queries have head dim {Kq}"
+
+        BLOCK_M = cls.BLOCK_M
+        BLOCK_N = cls.BLOCK_N
+        if cls.SPLIT_K is not None:
+            split_k = cls.SPLIT_K
+        else:
+            # Use heuristics
+            split_k = cls.get_split_k(B, H, Mk)
+
+        M_ceil = (M + BLOCK_M - 1) // BLOCK_M * BLOCK_M
+        o_splitk = torch.empty(
+            [B * G * H, split_k, M_ceil, Kq], dtype=torch.float32, device=q.device
+        )
+        metadata = torch.empty(
+            [B * G * H, 2, split_k, M_ceil], dtype=torch.float32, device=q.device
+        )
+        lse = torch.empty((B * G * H, M), device=q.device, dtype=torch.float32)
+        grid = (triton.cdiv(M, BLOCK_M), B * G * H, split_k)
+
+        num_warps = 2
+        split_size = (Mk + split_k - 1) // split_k
+        use_seq_len = seq_len is not None
+        _fwd_kernel_splitK_unrolled = unroll_varargs(
+            _fwd_kernel_splitK, N=cls.NUM_GROUPS if PACKED_PER_VAL > 1 else 1
+        )
+
+        _fwd_kernel_splitK_unrolled[grid](
+            Q=q,
+            K=k,
+            V=v,
+            sm_scale=inp.scale_float,
+            Out_splitK=o_splitk,
+            Metadata=metadata,
+            Seq_len=seq_len,
+            **_strides(q, "qz", "qm", "qg", "qh", "qk"),
+            **_strides(k, "kz", "kn", "kg", "kh", "kk"),
+            **_strides(v, "vz", "vn", "vg", "vh", "vk"),
+            **_strides(o_splitk, "osk_zhg", "osk_s", "osk_m", "osk_k"),
+            **_strides(metadata, "mzhg", "m2", "ms", "mm"),
+            Z=B,
+            H=H,
+            G=G,
+            N_CTX_Q=M,
+            N_CTX_K=Mk,
+            BLOCK_N_PER_SPLIT=split_size,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL=Lk,
+            BOUNDS_CHECKS_N=(split_size % BLOCK_N) > 0 or use_seq_len,
+            USE_SEQ_LEN=use_seq_len,
+            num_warps=num_warps,
+            num_stages=1,
+            PACKED_PER_VAL=PACKED_PER_VAL,
+            N_GROUPS=cls.NUM_GROUPS if PACKED_PER_VAL > 1 else 1,
+        )
+
+        if mqa_swap_seqlen_head:
+            out = torch.empty(
+                (B, H, G, M, Kq), device=q.device, dtype=q.dtype
+            ).transpose(1, 3)
+        else:
+            out = torch.empty((B, M, G, H, Kq), device=q.device, dtype=q.dtype)
+
+        # Merge together
+        grid = (B * G * H, M, 1)
+        _splitK_reduce[grid](
+            o_splitk,
+            metadata,
+            out,
+            lse,
+            split_k=split_k,
+            **_strides(o_splitk, "osk_zhg", "osk_s", "osk_m", "osk_k"),
+            **_strides(metadata, "mzhg", "m2", "ms", "mm"),
+            **_strides(out, "oz", "om", "og", "oh", "ok"),
+            **_strides(lse, "lse_zhg", "lse_m"),
+            BLOCK_SIZE=out.shape[-1],
+            G=G,
+            H=H,
+            # TODO: Tune num_warps
+        )
+        lse = lse.reshape([B, G, H, M])
+        if mqa_swap_seqlen_head:
+            # H/M dimensions have been swapped
+            out = out.transpose(1, 3)
+            lse = lse.transpose(2, 3)
+        if inp.query.ndim == 4:
+            # BMGHK -> BMHK
+            assert G == 1
+            out = out[:, :, 0]
+            lse = lse[:, 0]
+
+        return out, Context(out=out, lse=lse)
+
+
+class FwOp_S1(FwOp):
+    SPLIT_K = 1
+    NAME = "triton_splitK1"
+
+
+class FwOp_S2(FwOp):
+    SPLIT_K = 2
+    NAME = "triton_splitK2"
+
+
+class FwOp_S4(FwOp):
+    SPLIT_K = 4
+    NAME = "triton_splitK4"
+
+
+class FwOp_S8(FwOp):
+    SPLIT_K = 8
+    NAME = "triton_splitK8"
+
+
+class FwOp_S16(FwOp):
+    SPLIT_K = 16
+    NAME = "triton_splitK16"
+
+
+class FwOp_S32(FwOp):
+    SPLIT_K = 32
+    NAME = "triton_splitK32"
+
+
+class FwOp_S64(FwOp):
+    SPLIT_K = 64
+    NAME = "triton_splitK64"
+
+
+class FwOp_S128(FwOp):
+    SPLIT_K = 128
+    NAME = "triton_splitK128"