init

2026-01-09 13:34:11 +08:00
parent dfa6476b58
commit b2ef04d792
538 changed files with 105693 additions and 2 deletions
--- a/vllm/attention/ops/init.py
+++ b/vllm/attention/ops/init.py
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -0,0 +1,216 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.attention.ops.prefix_prefill import context_attention_fwd
+
+# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
+_PARTITION_SIZE = 512
+
+
+@dataclass
+class PagedAttentionMetadata:
+    """Metadata for PagedAttention."""
+    # (batch_size,). The length of sequences (entire tokens seen so far) per
+    # sequence.
+    seq_lens_tensor: Optional[torch.Tensor]
+    # Maximum sequence length in the batch.
+    max_seq_len: Optional[int]
+    # (batch_size, max_blocks_per_seq).
+    # Block addresses per sequence. (Seq id -> list of physical block)
+    # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
+    # in the kv cache. Each block can contain up to block_size tokens.
+    # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
+    # captured.
+    block_tables: Optional[torch.Tensor]
+
+
+class PagedAttention:
+
+    @staticmethod
+    def get_supported_head_sizes() -> List[int]:
+        return [64, 80, 96, 112, 128, 256]
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        return (2, num_blocks, block_size * num_kv_heads * head_size)
+
+    @staticmethod
+    def split_kv_cache(
+        kv_cache: torch.Tensor,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        x = 16 // kv_cache.element_size()
+        num_blocks = kv_cache.shape[1]
+
+        key_cache = kv_cache[0]
+        key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x,
+                                   -1, x)
+        value_cache = kv_cache[1]
+        value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1)
+        return key_cache, value_cache
+
+    @staticmethod
+    def write_to_paged_cache(
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        kv_cache_dtype: str,
+        kv_scale: float,
+    ) -> None:
+        ops.reshape_and_cache(
+            key,
+            value,
+            key_cache,
+            value_cache,
+            slot_mapping.flatten(),
+            kv_cache_dtype,
+            kv_scale,
+        )
+
+    @staticmethod
+    def forward_decode(
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_tables: torch.Tensor,
+        seq_lens: torch.Tensor,
+        max_seq_len: int,
+        kv_cache_dtype: str,
+        num_kv_heads: int,
+        scale: float,
+        alibi_slopes: Optional[torch.Tensor],
+        kv_scale: float,
+    ) -> torch.Tensor:
+        output = torch.empty_like(query)
+
+        block_size = value_cache.shape[3]
+        num_seqs, num_heads, head_size = query.shape
+        max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) //
+                              _PARTITION_SIZE)
+        # NOTE(woosuk): We use a simple heuristic to decide whether to use
+        # PagedAttention V1 or V2. If the number of partitions is 1, we use
+        # V1 to avoid the overhead of reduction. Also, if the number of
+        # sequences or heads is large, we use V1 since there is enough work
+        # to parallelize.
+        # TODO(woosuk): Tune this heuristic.
+        # For context len > 8192, use V2 kernel to avoid shared memory shortage.
+        use_v1 = (max_seq_len <= 8192
+                  and (max_num_partitions == 1 or num_seqs * num_heads > 512))
+        if use_v1:
+            # Run PagedAttention V1.
+            ops.paged_attention_v1(
+                output,
+                query,
+                key_cache,
+                value_cache,
+                num_kv_heads,
+                scale,
+                block_tables,
+                seq_lens,
+                block_size,
+                max_seq_len,
+                alibi_slopes,
+                kv_cache_dtype,
+                kv_scale,
+            )
+        else:
+            # Run PagedAttention V2.
+            assert _PARTITION_SIZE % block_size == 0
+            tmp_output = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions, head_size),
+                dtype=output.dtype,
+                device=output.device,
+            )
+            exp_sums = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions),
+                dtype=torch.float32,
+                device=output.device,
+            )
+            max_logits = torch.empty_like(exp_sums)
+            ops.paged_attention_v2(
+                output,
+                exp_sums,
+                max_logits,
+                tmp_output,
+                query,
+                key_cache,
+                value_cache,
+                num_kv_heads,
+                scale,
+                block_tables,
+                seq_lens,
+                block_size,
+                max_seq_len,
+                alibi_slopes,
+                kv_cache_dtype,
+                kv_scale,
+            )
+        return output
+
+    @staticmethod
+    def forward_prefix(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_tables: torch.Tensor,
+        subquery_start_loc: torch.Tensor,
+        seq_lens_tensor: torch.Tensor,
+        context_lens: torch.Tensor,
+        max_query_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        sliding_window: Optional[int],
+    ) -> torch.Tensor:
+        output = torch.empty_like(query)
+        context_attention_fwd(
+            query,
+            key,
+            value,
+            output,
+            key_cache,
+            value_cache,
+            block_tables,
+            # subquery_start_loc is (batch_size + 1,)
+            subquery_start_loc[:-1],
+            seq_lens_tensor,
+            context_lens,
+            max_query_len,
+            alibi_slopes,
+            sliding_window,
+        )
+        return output
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: Dict[int, int],
+    ) -> None:
+        src_key_cache = src_kv_cache[0]
+        dst_key_cache = dst_kv_cache[0]
+        ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
+
+        src_value_cache = src_kv_cache[1]
+        dst_value_cache = dst_kv_cache[1]
+        ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst)
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: Dict[int, List[int]],
+    ) -> None:
+        key_caches = [kv_cache[0] for kv_cache in kv_caches]
+        value_caches = [kv_cache[1] for kv_cache in kv_caches]
+        ops.copy_blocks(key_caches, value_caches, src_to_dists)
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -0,0 +1,792 @@
+# The kernels in this file are adapted from LightLLM's context_attention_fwd:
+# https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
+
+import torch
+import triton
+import triton.language as tl
+
+if triton.__version__ >= "2.1.0":
+
+    @triton.jit
+    def _fwd_kernel(
+        Q,
+        K,
+        V,
+        K_cache,
+        V_cache,
+        B_Loc,
+        sm_scale,
+        B_Start_Loc,
+        B_Seqlen,
+        B_Ctxlen,
+        block_size,
+        x,
+        Out,
+        stride_b_loc_b,
+        stride_b_loc_s,
+        stride_qbs,
+        stride_qh,
+        stride_qd,
+        stride_kbs,
+        stride_kh,
+        stride_kd,
+        stride_vbs,
+        stride_vh,
+        stride_vd,
+        stride_obs,
+        stride_oh,
+        stride_od,
+        stride_k_cache_bs,
+        stride_k_cache_h,
+        stride_k_cache_d,
+        stride_k_cache_bl,
+        stride_k_cache_x,
+        stride_v_cache_bs,
+        stride_v_cache_h,
+        stride_v_cache_d,
+        stride_v_cache_bl,
+        num_queries_per_kv: int,
+        BLOCK_M: tl.constexpr,
+        BLOCK_DMODEL: tl.constexpr,  # head size
+        BLOCK_DMODEL_PADDED: tl.constexpr,  # head size padded to a power of 2
+        BLOCK_N: tl.constexpr,
+        SLIDING_WINDOW: tl.constexpr,
+    ):
+        cur_batch = tl.program_id(0)
+        cur_head = tl.program_id(1)
+        start_m = tl.program_id(2)
+
+        cur_kv_head = cur_head // num_queries_per_kv
+
+        cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
+        cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+        cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+        cur_batch_query_len = cur_batch_seq_len - cur_batch_ctx_len
+
+        # start position inside of the query
+        # generally, N goes over kv, while M goes over query_len
+        block_start_loc = BLOCK_M * start_m
+
+        # initialize offsets
+        # [N]; starts at 0
+        offs_n = tl.arange(0, BLOCK_N)
+        # [D]; starts at 0
+        offs_d = tl.arange(0, BLOCK_DMODEL_PADDED)
+        # [M]; starts at current position in query
+        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+        # [M,D]
+        off_q = (
+            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
+            cur_head * stride_qh + offs_d[None, :] * stride_qd)
+
+        dim_mask = tl.where(
+            tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1,
+            0).to(tl.int1)  # [D]
+
+        q = tl.load(Q + off_q,
+                    mask=dim_mask[None, :] &
+                    (offs_m[:, None] < cur_batch_query_len),
+                    other=0.0)  # [M,D]
+
+        # initialize pointer to m and l
+        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")  # [M]
+        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)  # [M]
+        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED],
+                       dtype=tl.float32)  # [M,D]
+
+        # compute query against context (no causal mask here)
+        for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
+            start_n = tl.multiple_of(start_n, BLOCK_N)
+            # -- compute qk ----
+            bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
+                         ((start_n + offs_n) // block_size) * stride_b_loc_s,
+                         mask=(start_n + offs_n) < cur_batch_ctx_len,
+                         other=0)  # [N]
+            # [D,N]
+            off_k = (bn[None, :] * stride_k_cache_bs +
+                     cur_kv_head * stride_k_cache_h +
+                     (offs_d[:, None] // x) * stride_k_cache_d +
+                     ((start_n + offs_n[None, :]) % block_size) *
+                     stride_k_cache_bl +
+                     (offs_d[:, None] % x) * stride_k_cache_x)
+            # [N,D]
+            off_v = (
+                bn[:, None] * stride_v_cache_bs +
+                cur_kv_head * stride_v_cache_h +
+                offs_d[None, :] * stride_v_cache_d +
+                (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
+            k = tl.load(K_cache + off_k,
+                        mask=dim_mask[:, None] &
+                        ((start_n + offs_n[None, :]) < cur_batch_ctx_len),
+                        other=0.0)  # [D,N]
+
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)  # [M,N]
+            qk += tl.dot(q, k)
+            qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
+                          float("-inf"))
+            qk *= sm_scale
+            if SLIDING_WINDOW > 0:
+                # (cur_batch_ctx_len + offs_m[:, None]) are the positions of
+                # Q entries in sequence
+                # (start_n + offs_n[None, :]) are the positions of
+                # KV entries in sequence
+                # So the condition makes sure each entry in Q only attends
+                # to KV entries not more than SLIDING_WINDOW away.
+                #
+                # We can't use -inf here, because the
+                # sliding window may lead to the entire row being masked.
+                # This then makes m_ij contain -inf, which causes NaNs in
+                # exp().
+                qk = tl.where((cur_batch_ctx_len + offs_m[:, None]) -
+                              (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk,
+                              -10000)
+
+            # -- compute m_ij, p, l_ij
+            m_ij = tl.max(qk, 1)  # [M]
+            p = tl.exp(qk - m_ij[:, None])  # [M,N]
+            l_ij = tl.sum(p, 1)  # [M]
+            # -- update m_i and l_i
+            m_i_new = tl.maximum(m_i, m_ij)  # [M]
+            alpha = tl.exp(m_i - m_i_new)  # [M]
+            beta = tl.exp(m_ij - m_i_new)  # [M]
+            l_i_new = alpha * l_i + beta * l_ij  # [M]
+
+            # -- update output accumulator --
+            # scale p
+            p_scale = beta / l_i_new
+            p = p * p_scale[:, None]
+            # scale acc
+            acc_scale = l_i / l_i_new * alpha
+            acc = acc * acc_scale[:, None]
+            # update acc
+            v = tl.load(V_cache + off_v,
+                        mask=dim_mask[None, :] &
+                        ((start_n + offs_n[:, None]) < cur_batch_ctx_len),
+                        other=0.0)  # [N,D]
+
+            p = p.to(v.dtype)
+            acc += tl.dot(p, v)
+            # # update m_i and l_i
+            l_i = l_i_new
+            m_i = m_i_new
+
+        off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
+                 offs_d[:, None] * stride_kd)
+        off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
+                 offs_d[None, :] * stride_vd)
+        k_ptrs = K + off_k
+        v_ptrs = V + off_v
+
+        # block_mask is 0 when we're already past the current query length
+        block_mask = tl.where(block_start_loc < cur_batch_query_len, 1, 0)
+
+        # compute query against itself (with causal mask)
+        for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
+            start_n = tl.multiple_of(start_n, BLOCK_N)
+            # -- compute qk ----
+            k = tl.load(k_ptrs +
+                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
+                        mask=dim_mask[:, None] &
+                        ((start_n + offs_n[None, :]) < cur_batch_query_len),
+                        other=0.0)
+
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            qk += tl.dot(q, k)
+            qk *= sm_scale
+            # apply causal mask
+            qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
+                          float("-inf"))
+            if SLIDING_WINDOW > 0:
+                qk = tl.where(
+                    offs_m[:, None] -
+                    (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000)
+
+            # -- compute m_ij, p, l_ij
+            m_ij = tl.max(qk, 1)
+            p = tl.exp(qk - m_ij[:, None])
+            l_ij = tl.sum(p, 1)
+            # -- update m_i and l_i
+            m_i_new = tl.maximum(m_i, m_ij)
+            alpha = tl.exp(m_i - m_i_new)
+            beta = tl.exp(m_ij - m_i_new)
+            l_i_new = alpha * l_i + beta * l_ij
+            # -- update output accumulator --
+            # scale p
+            p_scale = beta / l_i_new
+            p = p * p_scale[:, None]
+            # scale acc
+            acc_scale = l_i / l_i_new * alpha
+            acc = acc * acc_scale[:, None]
+            # update acc
+            v = tl.load(v_ptrs +
+                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
+                        mask=dim_mask[None, :] &
+                        ((start_n + offs_n[:, None]) < cur_batch_query_len),
+                        other=0.0)
+
+            p = p.to(v.dtype)
+            acc += tl.dot(p, v)
+            # update m_i and l_i
+            l_i = l_i_new
+            m_i = m_i_new
+        # initialize pointers to output
+        off_o = (
+            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+            cur_head * stride_oh + offs_d[None, :] * stride_od)
+        out_ptrs = Out + off_o
+        tl.store(out_ptrs,
+                 acc,
+                 mask=dim_mask[None, :] &
+                 (offs_m[:, None] < cur_batch_query_len))
+        return
+
+    @triton.jit
+    def _fwd_kernel_flash_attn_v2(
+        Q,
+        K,
+        V,
+        K_cache,
+        V_cache,
+        B_Loc,
+        sm_scale,
+        B_Start_Loc,
+        B_Seqlen,
+        B_Ctxlen,
+        block_size,
+        x,
+        Out,
+        stride_b_loc_b,
+        stride_b_loc_s,
+        stride_qbs,
+        stride_qh,
+        stride_qd,
+        stride_kbs,
+        stride_kh,
+        stride_kd,
+        stride_vbs,
+        stride_vh,
+        stride_vd,
+        stride_obs,
+        stride_oh,
+        stride_od,
+        stride_k_cache_bs,
+        stride_k_cache_h,
+        stride_k_cache_d,
+        stride_k_cache_bl,
+        stride_k_cache_x,
+        stride_v_cache_bs,
+        stride_v_cache_h,
+        stride_v_cache_d,
+        stride_v_cache_bl,
+        num_queries_per_kv: int,
+        BLOCK_M: tl.constexpr,
+        BLOCK_DMODEL: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+    ):
+        cur_batch = tl.program_id(0)
+        cur_head = tl.program_id(1)
+        start_m = tl.program_id(2)
+
+        cur_kv_head = cur_head // num_queries_per_kv
+
+        cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
+        cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+        cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+
+        block_start_loc = BLOCK_M * start_m
+
+        # initialize offsets
+        offs_n = tl.arange(0, BLOCK_N)
+        offs_d = tl.arange(0, BLOCK_DMODEL)
+        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+        off_q = (
+            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
+            cur_head * stride_qh + offs_d[None, :] * stride_qd)
+
+        q = tl.load(
+            Q + off_q,
+            mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+            other=0.0)
+
+        # # initialize pointer to m and l
+        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+
+        for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
+            start_n = tl.multiple_of(start_n, BLOCK_N)
+            # -- compute qk ----
+            bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
+                         ((start_n + offs_n) // block_size) * stride_b_loc_s,
+                         mask=(start_n + offs_n) < cur_batch_ctx_len,
+                         other=0)
+            off_k = (bn[None, :] * stride_k_cache_bs +
+                     cur_kv_head * stride_k_cache_h +
+                     (offs_d[:, None] // x) * stride_k_cache_d +
+                     ((start_n + offs_n[None, :]) % block_size) *
+                     stride_k_cache_bl +
+                     (offs_d[:, None] % x) * stride_k_cache_x)
+            off_v = (
+                bn[:, None] * stride_v_cache_bs +
+                cur_kv_head * stride_v_cache_h +
+                offs_d[None, :] * stride_v_cache_d +
+                (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
+            k = tl.load(K_cache + off_k,
+                        mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
+                        other=0.0)
+
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            qk += tl.dot(q, k)
+            qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
+                          float("-inf"))
+            qk *= sm_scale
+
+            # -- compute m_ij, p, l_ij
+            m_ij = tl.max(qk, 1)
+            m_i_new = tl.maximum(m_i, m_ij)
+            p = tl.math.exp(qk - m_i_new[:, None])
+            l_ij = tl.sum(p, 1)
+            # -- update m_i and l_i
+
+            alpha = tl.math.exp(m_i - m_i_new)
+            l_i_new = alpha * l_i + l_ij
+            # -- update output accumulator --
+            # scale p
+            # scale acc
+            acc_scale = alpha
+            # acc_scale = l_i / l_i_new * alpha
+            acc = acc * acc_scale[:, None]
+            # update acc
+            v = tl.load(V_cache + off_v,
+                        mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
+                        other=0.0)
+
+            p = p.to(v.dtype)
+            acc += tl.dot(p, v)
+            # update m_i and l_i
+            l_i = l_i_new
+            m_i = m_i_new
+
+        off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
+                 offs_d[:, None] * stride_kd)
+        off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
+                 offs_d[None, :] * stride_vd)
+        k_ptrs = K + off_k
+        v_ptrs = V + off_v
+
+        block_mask = tl.where(
+            block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
+
+        for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
+            start_n = tl.multiple_of(start_n, BLOCK_N)
+            # -- compute qk ----
+            k = tl.load(k_ptrs +
+                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
+                        mask=(start_n + offs_n[None, :]) <
+                        cur_batch_seq_len - cur_batch_ctx_len,
+                        other=0.0)
+
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            qk += tl.dot(q, k)
+            qk *= sm_scale
+            qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
+                          float("-inf"))
+
+            # -- compute m_ij, p, l_ij
+            m_ij = tl.max(qk, 1)
+            m_i_new = tl.maximum(m_i, m_ij)
+            p = tl.math.exp(qk - m_i_new[:, None])
+            l_ij = tl.sum(p, 1)
+            # -- update m_i and l_i
+
+            alpha = tl.math.exp(m_i - m_i_new)
+            l_i_new = alpha * l_i + l_ij
+            # -- update output accumulator --
+            # scale p
+            # scale acc
+            acc_scale = alpha
+            # acc_scale = l_i / l_i_new * alpha
+            acc = acc * acc_scale[:, None]
+            # update acc
+            v = tl.load(v_ptrs +
+                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
+                        mask=(start_n + offs_n[:, None]) <
+                        cur_batch_seq_len - cur_batch_ctx_len,
+                        other=0.0)
+
+            p = p.to(v.dtype)
+            acc += tl.dot(p, v)
+            # update m_i and l_i
+            l_i = l_i_new
+            m_i = m_i_new
+
+        # acc /= l_i[:, None]
+        # initialize pointers to output
+        off_o = (
+            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+            cur_head * stride_oh + offs_d[None, :] * stride_od)
+        out_ptrs = Out + off_o
+        tl.store(out_ptrs,
+                 acc,
+                 mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+        return
+
+    @triton.jit
+    def _fwd_kernel_alibi(
+        Q,
+        K,
+        V,
+        K_cache,
+        V_cache,
+        B_Loc,
+        sm_scale,
+        B_Start_Loc,
+        B_Seqlen,
+        B_Ctxlen,
+        Alibi_slopes,
+        block_size,
+        x,
+        Out,
+        stride_b_loc_b,
+        stride_b_loc_s,
+        stride_qbs,
+        stride_qh,
+        stride_qd,
+        stride_kbs,
+        stride_kh,
+        stride_kd,
+        stride_vbs,
+        stride_vh,
+        stride_vd,
+        stride_obs,
+        stride_oh,
+        stride_od,
+        stride_k_cache_bs,
+        stride_k_cache_h,
+        stride_k_cache_d,
+        stride_k_cache_bl,
+        stride_k_cache_x,
+        stride_v_cache_bs,
+        stride_v_cache_h,
+        stride_v_cache_d,
+        stride_v_cache_bl,
+        num_queries_per_kv: int,
+        BLOCK_M: tl.constexpr,
+        BLOCK_DMODEL: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+    ):
+        # attn_bias[]
+        cur_batch = tl.program_id(0)
+        cur_head = tl.program_id(1)
+        start_m = tl.program_id(2)
+
+        cur_kv_head = cur_head // num_queries_per_kv
+
+        # cur_batch_seq_len: the length of prompts
+        # cur_batch_ctx_len: the length of prefix
+        # cur_batch_in_all_start_index: the start id of the dim=0
+        cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
+        cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+        cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+
+        block_start_loc = BLOCK_M * start_m
+
+        # initialize offsets
+        offs_n = tl.arange(0, BLOCK_N)
+        offs_d = tl.arange(0, BLOCK_DMODEL)
+        offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+        off_q = (
+            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
+            cur_head * stride_qh + offs_d[None, :] * stride_qd)
+
+        q = tl.load(
+            Q + off_q,
+            mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+            other=0.0)
+
+        # # initialize pointer to m and l
+        m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+        l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+        acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+
+        alibi_slope = tl.load(Alibi_slopes + cur_head)
+        alibi_start_q = tl.arange(
+            0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
+        alibi_start_k = 0
+        for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
+            start_n = tl.multiple_of(start_n, BLOCK_N)
+            # -- compute qk ----
+            bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
+                         ((start_n + offs_n) // block_size) * stride_b_loc_s,
+                         mask=(start_n + offs_n) < cur_batch_ctx_len,
+                         other=0)
+            off_k = (bn[None, :] * stride_k_cache_bs +
+                     cur_kv_head * stride_k_cache_h +
+                     (offs_d[:, None] // x) * stride_k_cache_d +
+                     ((start_n + offs_n[None, :]) % block_size) *
+                     stride_k_cache_bl +
+                     (offs_d[:, None] % x) * stride_k_cache_x)
+            off_v = (
+                bn[:, None] * stride_v_cache_bs +
+                cur_kv_head * stride_v_cache_h +
+                offs_d[None, :] * stride_v_cache_d +
+                (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
+            k = tl.load(K_cache + off_k,
+                        mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
+                        other=0.0)
+
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            qk += tl.dot(q, k)
+            qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk,
+                          float("-inf"))
+            qk *= sm_scale
+
+            # load alibi
+            alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
+                     alibi_start_q[:, None]) * alibi_slope
+            alibi = tl.where(
+                (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
+                alibi, float("-inf"))
+            qk += alibi
+            alibi_start_k += BLOCK_N
+
+            # -- compute m_ij, p, l_ij
+            m_ij = tl.max(qk, 1)
+            m_i_new = tl.maximum(m_i, m_ij)
+            p = tl.math.exp(qk - m_i_new[:, None])
+            l_ij = tl.sum(p, 1)
+            # -- update m_i and l_i
+
+            alpha = tl.math.exp(m_i - m_i_new)
+            l_i_new = alpha * l_i + l_ij
+            # -- update output accumulator --
+            # scale p
+            # scale acc
+            acc_scale = alpha
+            # acc_scale = l_i / l_i_new * alpha
+            acc = acc * acc_scale[:, None]
+            # update acc
+            v = tl.load(V_cache + off_v,
+                        mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
+                        other=0.0)
+
+            p = p.to(v.dtype)
+            acc += tl.dot(p, v, allow_tf32=False)
+            # update m_i and l_i
+            l_i = l_i_new
+            m_i = m_i_new
+
+        off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh +
+                 offs_d[:, None] * stride_kd)
+        off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh +
+                 offs_d[None, :] * stride_vd)
+        k_ptrs = K + off_k
+        v_ptrs = V + off_v
+
+        block_mask = tl.where(
+            block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
+
+        # init alibi
+        alibi_slope = tl.load(Alibi_slopes + cur_head)
+        alibi_start_q = tl.arange(
+            0, BLOCK_M) + block_start_loc + cur_batch_ctx_len
+        alibi_start_k = cur_batch_ctx_len
+        # # init debugger
+        # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc
+        # offset_db_k = tl.arange(0, BLOCK_N)
+        # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL]
+        for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
+            start_n = tl.multiple_of(start_n, BLOCK_N)
+            # -- compute qk ----
+            k = tl.load(k_ptrs +
+                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
+                        mask=(start_n + offs_n[None, :]) <
+                        cur_batch_seq_len - cur_batch_ctx_len,
+                        other=0.0)
+
+            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            qk += tl.dot(q, k, allow_tf32=False)
+            qk *= sm_scale
+            qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk,
+                          float("-inf"))
+
+            # load alibi
+            alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k -
+                     alibi_start_q[:, None]) * alibi_slope
+            alibi = tl.where(
+                (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len),
+                alibi, float("-inf"))
+            qk += alibi
+            alibi_start_k += BLOCK_N
+
+            # -- compute m_ij, p, l_ij
+            m_ij = tl.max(qk, 1)
+            m_i_new = tl.maximum(m_i, m_ij)
+            p = tl.math.exp(qk - m_i_new[:, None])
+            l_ij = tl.sum(p, 1)
+            # -- update m_i and l_i
+
+            alpha = tl.math.exp(m_i - m_i_new)
+            l_i_new = alpha * l_i + l_ij
+            # -- update output accumulator --
+            # scale p
+            # scale acc
+            acc_scale = alpha
+            # acc_scale = l_i / l_i_new * alpha
+            acc = acc * acc_scale[:, None]
+            # update acc
+            v = tl.load(v_ptrs +
+                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
+                        mask=(start_n + offs_n[:, None]) <
+                        cur_batch_seq_len - cur_batch_ctx_len,
+                        other=0.0)
+
+            p = p.to(v.dtype)
+            acc += tl.dot(p, v, allow_tf32=False)
+            # update m_i and l_i
+            l_i = l_i_new
+            m_i = m_i_new
+
+        acc = acc / l_i[:, None]
+
+        # initialize pointers to output
+        off_o = (
+            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
+            cur_head * stride_oh + offs_d[None, :] * stride_od)
+        out_ptrs = Out + off_o
+        tl.store(out_ptrs,
+                 acc,
+                 mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+        return
+
+    @torch.inference_mode()
+    def context_attention_fwd(q,
+                              k,
+                              v,
+                              o,
+                              k_cache,
+                              v_cache,
+                              b_loc,
+                              b_start_loc,
+                              b_seq_len,
+                              b_ctx_len,
+                              max_input_len,
+                              alibi_slopes=None,
+                              sliding_window=None):
+
+        cap = torch.musa.get_device_capability()
+        BLOCK = 128 if cap[0] >= 8 else 64
+        # shape constraints
+        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
+        assert Lq == Lk and Lk == Lv
+        # round up Lk to a power of 2 - this is required for Triton block size
+        Lk_padded = triton.next_power_of_2(Lk)
+
+        sm_scale = 1.0 / (Lq**0.5)
+        batch, head = b_seq_len.shape[0], q.shape[1]
+        num_queries_per_kv = q.shape[1] // k.shape[1]
+
+        grid = (batch, head, triton.cdiv(max_input_len, BLOCK))  # batch, head,
+
+        num_warps = 8 if Lk <= 64 else 8
+        if alibi_slopes is not None:
+            assert Lk == Lk_padded
+            _fwd_kernel_alibi[grid](
+                q,
+                k,
+                v,
+                k_cache,
+                v_cache,
+                b_loc,
+                sm_scale,
+                b_start_loc,
+                b_seq_len,
+                b_ctx_len,
+                alibi_slopes,
+                v_cache.shape[3],
+                8,
+                o,
+                b_loc.stride(0),
+                b_loc.stride(1),
+                q.stride(0),
+                q.stride(1),
+                q.stride(2),
+                k.stride(0),
+                k.stride(1),
+                k.stride(2),
+                v.stride(0),
+                v.stride(1),
+                v.stride(2),
+                o.stride(0),
+                o.stride(1),
+                o.stride(2),
+                k_cache.stride(0),
+                k_cache.stride(1),
+                k_cache.stride(2),
+                k_cache.stride(3),
+                k_cache.stride(
+                    4
+                ),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
+                v_cache.stride(0),
+                v_cache.stride(1),
+                v_cache.stride(2),
+                v_cache.stride(
+                    3),  #[num_blocks, num_kv_heads, head_size, block_size]
+                num_queries_per_kv=num_queries_per_kv,
+                BLOCK_M=BLOCK,
+                BLOCK_DMODEL=Lk,
+                BLOCK_N=BLOCK,
+                num_warps=num_warps,
+                num_stages=1,
+            )
+            return
+
+        _fwd_kernel[grid](
+            q,
+            k,
+            v,
+            k_cache,
+            v_cache,
+            b_loc,
+            sm_scale,
+            b_start_loc,
+            b_seq_len,
+            b_ctx_len,
+            v_cache.shape[3],
+            8,
+            o,
+            b_loc.stride(0),
+            b_loc.stride(1),
+            q.stride(0),
+            q.stride(1),
+            q.stride(2),
+            k.stride(0),
+            k.stride(1),
+            k.stride(2),
+            v.stride(0),
+            v.stride(1),
+            v.stride(2),
+            o.stride(0),
+            o.stride(1),
+            o.stride(2),
+            k_cache.stride(0),
+            k_cache.stride(1),
+            k_cache.stride(2),
+            k_cache.stride(3),
+            k_cache.stride(
+                4),  #[num_blocks, num_kv_heads, head_size/x, block_size, x]
+            v_cache.stride(0),
+            v_cache.stride(1),
+            v_cache.stride(2),
+            v_cache.stride(
+                3),  #[num_blocks, num_kv_heads, head_size, block_size]
+            num_queries_per_kv=num_queries_per_kv,
+            BLOCK_M=BLOCK,
+            BLOCK_DMODEL=Lk,
+            BLOCK_DMODEL_PADDED=Lk_padded,
+            BLOCK_N=BLOCK,
+            SLIDING_WINDOW=sliding_window if sliding_window is not None else 0,
+            num_warps=num_warps,
+            num_stages=1,
+        )
+        return
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -0,0 +1,810 @@
+#!/usr/bin/env python
+"""
+Fused Attention
+===============
+
+This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao
+(https://tridao.me/publications/flash2/flash2.pdf)
+Credits: OpenAI kernel team, AMD ML Frameworks Triton team
+
+Features supported:
+
+1) Fwd with causal masking
+2) Any sequence lengths without padding (currently fwd kernel only)
+3) Support for different sequence lengths for q and k
+4) Nested tensor API currently does not support dropout or bias.
+
+Not currently supported:
+
+1) Non power of two head dims
+
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+torch_dtype: tl.constexpr = torch.float16
+
+
+@triton.jit
+def cdiv_fn(x, y):
+    return (x + y - 1) // y
+
+
+@triton.jit
+def max_fn(x, y):
+    return tl.math.max(x, y)
+
+
+@triton.jit
+def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
+    ms = tl.arange(0, m)
+    ns = tl.arange(0, n)
+    return philox_offset + ms[:, None] * stride + ns[None, :]
+
+
+@triton.jit
+def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
+    rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n,
+                                  stride).to(tl.uint32)
+    # TODO: use tl.randint for better performance
+    return tl.rand(philox_seed, rng_offsets)
+
+
+@triton.jit
+def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
+    rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n,
+                             stride)
+    rng_keep = rng_output > dropout_p
+    return rng_keep
+
+
+@triton.jit
+def load_fn(block_ptr, first, second, pad):
+    if first and second:
+        tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad)
+    elif first:
+        tensor = tl.load(block_ptr, boundary_check=(0, ), padding_option=pad)
+    elif second:
+        tensor = tl.load(block_ptr, boundary_check=(1, ), padding_option=pad)
+    else:
+        tensor = tl.load(block_ptr)
+    return tensor
+
+
+@triton.jit
+def _attn_fwd_inner(
+    acc,
+    l_i,
+    m_i,
+    q,
+    K_block_ptr,
+    V_block_ptr,
+    start_m,
+    actual_seqlen_k,
+    dropout_p,
+    philox_seed,
+    batch_philox_offset,
+    encoded_softmax_block_ptr,
+    block_min,
+    block_max,
+    offs_n_causal,
+    masked_blocks,
+    n_extra_tokens,
+    bias_ptr,
+    IS_CAUSAL: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    OFFS_M: tl.constexpr,
+    OFFS_N: tl.constexpr,
+    PRE_LOAD_V: tl.constexpr,
+    MASK_STEPS: tl.constexpr,
+    ENABLE_DROPOUT: tl.constexpr,
+    RETURN_ENCODED_SOFTMAX: tl.constexpr,
+    PADDED_HEAD: tl.constexpr,
+):
+    # loop over k, v, and update accumulator
+    for start_n in range(block_min, block_max, BLOCK_N):
+        # For padded blocks, we will overrun the tensor size if
+        # we load all BLOCK_N. For others, the blocks are all within range.
+        k = load_fn(
+            K_block_ptr,
+            PADDED_HEAD,
+            MASK_STEPS and (n_extra_tokens != 0),
+            "zero",
+        )
+        if PRE_LOAD_V:
+            v = load_fn(
+                V_block_ptr,
+                MASK_STEPS and (n_extra_tokens != 0),
+                PADDED_HEAD,
+                "zero",
+            )
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        # We start from end of seqlen_k so only the first iteration would need
+        # to be checked for padding if it is not a multiple of block_n
+        # TODO: This can be optimized to only be true for the padded block.
+        if MASK_STEPS:  # noqa: SIM102
+            # If this is the last block / iteration, we want to
+            # mask if the sequence length is not a multiple of block size
+            # a solution is to always do BLOCK_M // BLOCK_N + 1 steps
+            # if not is_modulo_mn. last step might get wasted but that is okay.
+            # check if this masking works for that case.
+            if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0):
+                boundary_m = tl.full([BLOCK_M],
+                                     actual_seqlen_k,
+                                     dtype=tl.int32)
+                size_n = start_n + OFFS_N[None, :]
+                mask = size_n < boundary_m[:, None]
+                qk = tl.where(mask, qk, float("-inf"))
+        if IS_CAUSAL:
+            causal_boundary = start_n + offs_n_causal
+            causal_mask = OFFS_M[:, None] >= causal_boundary[None, :]
+            qk = tl.where(causal_mask, qk, float("-inf"))
+        # -- compute qk ----
+        qk += tl.dot(q, k)
+        if bias_ptr is not None:
+            bias = load_fn(bias_ptr, False, MASK_STEPS
+                           and (n_extra_tokens != 0), "zero")
+            # While bias is added after multiplying qk with sm_scale, our
+            # optimization to use 2^x instead of e^x results in an additional
+            # scale factor of log2(e) which we must also multiply the bias with.
+            qk += bias * 1.44269504089
+        m_ij = tl.maximum(m_i, tl.max(qk, 1))
+        qk = qk - m_ij[:, None]
+        p = tl.math.exp2(qk)
+
+        # CAVEAT: Must update l_ij before applying dropout
+        l_ij = tl.sum(p, 1)
+        if ENABLE_DROPOUT:
+            philox_offset = (batch_philox_offset +
+                             start_m * BLOCK_M * actual_seqlen_k + start_n -
+                             BLOCK_N)
+            keep = dropout_mask(
+                philox_seed,
+                philox_offset,
+                dropout_p,
+                BLOCK_M,
+                BLOCK_N,
+                actual_seqlen_k,
+            )
+            if RETURN_ENCODED_SOFTMAX:
+                tl.store(
+                    encoded_softmax_block_ptr,
+                    tl.where(keep, p,
+                             -p).to(encoded_softmax_block_ptr.type.element_ty),
+                )
+            p = tl.where(keep, p, 0.0)
+        elif RETURN_ENCODED_SOFTMAX:
+            tl.store(
+                encoded_softmax_block_ptr,
+                p.to(encoded_softmax_block_ptr.type.element_ty),
+            )
+        # -- update output accumulator --
+        alpha = tl.math.exp2(m_i - m_ij)
+        acc = acc * alpha[:, None]
+        if not PRE_LOAD_V:
+            v = load_fn(
+                V_block_ptr,
+                MASK_STEPS and (n_extra_tokens != 0),
+                PADDED_HEAD,
+                "zero",
+            )
+        # -- update m_i and l_i
+        l_i = l_i * alpha + l_ij
+        # update m_i and l_i
+        m_i = m_ij
+        acc += tl.dot(p.to(V_block_ptr.type.element_ty), v)
+        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+        if bias_ptr is not None:
+            bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N))
+        if RETURN_ENCODED_SOFTMAX:
+            encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr,
+                                                   (0, BLOCK_N))
+    return acc, l_i, m_i
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_M": 256,
+                "BLOCK_N": 64,
+                "waves_per_eu": 2,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 128,
+                "waves_per_eu": 2,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 256,
+                "BLOCK_N": 128,
+                "waves_per_eu": 2,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "waves_per_eu": 3,
+                "PRE_LOAD_V": True,
+            },
+            num_stages=1,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "waves_per_eu": 3,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 64,
+                "waves_per_eu": 4,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 32,
+                "BLOCK_N": 32,
+                "waves_per_eu": 4,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=8,
+        ),
+        # TODO: This config fails with head_size not pow2 with data mismatches.
+        #    triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1,
+        #                   'PRE_LOAD_V': False}, num_stages=1, num_warps=4),
+        triton.Config(
+            {
+                "BLOCK_M": 16,
+                "BLOCK_N": 16,
+                "waves_per_eu": 1,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=4,
+        ),
+    ],
+    key=['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL'],
+)
+@triton.jit
+def attn_fwd(
+    Q,
+    K,
+    V,
+    bias,
+    sm_scale,
+    L,
+    Out,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vk,
+    stride_vn,
+    stride_oz,
+    stride_oh,
+    stride_om,
+    stride_on,
+    stride_bz,
+    stride_bh,
+    stride_bm,
+    stride_bn,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    dropout_p,
+    philox_seed,
+    philox_offset_base,
+    encoded_softmax,
+    HQ: tl.constexpr,
+    HK: tl.constexpr,
+    ACTUAL_BLOCK_DMODEL: tl.constexpr,
+    MAX_SEQLENS_Q: tl.constexpr,
+    MAX_SEQLENS_K: tl.constexpr,
+    VARLEN: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    PRE_LOAD_V: tl.constexpr,
+    BIAS_TYPE: tl.constexpr,
+    ENABLE_DROPOUT: tl.constexpr,
+    RETURN_ENCODED_SOFTMAX: tl.constexpr,
+):
+    start_m = tl.program_id(0)
+    off_h_q = tl.program_id(1)
+    off_z = tl.program_id(2)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
+    if VARLEN:
+        cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
+        cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
+        seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start
+        # We have a one-size-fits-all grid in id(0). Some seqlens might be too
+        # small for all start_m so for those we return early.
+        if start_m * BLOCK_M > seqlen_q:
+            return
+        cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
+        cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
+        seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start
+    else:
+        cu_seqlens_q_start = 0
+        cu_seqlens_k_start = 0
+        seqlen_q = MAX_SEQLENS_Q
+        seqlen_k = MAX_SEQLENS_K
+
+    # Now we compute whether we need to exit early due to causal masking.
+    # This is because for seqlen_q > seqlen_k, M rows of the attn scores
+    # are completely masked, resulting in 0s written to the output, and
+    # inf written to LSE. We don't need to do any GEMMs in this case.
+    # This block of code determines what N is, and if this WG is operating
+    # on those M rows.
+    n_blocks = cdiv_fn(seqlen_k, BLOCK_N)
+    if IS_CAUSAL:
+        # If seqlen_q == seqlen_k, the attn scores are a square matrix.
+        # If seqlen_q != seqlen_k, attn scores are rectangular which means
+        # the causal mask boundary is bottom right aligned, and ends at either
+        # the top edge (seqlen_q < seqlen_k) or left edge.
+        # This captures the decrease in n_blocks if we have a rectangular attn
+        # matrix
+        n_blocks_seqlen = cdiv_fn(
+            (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N)
+        # This is what adjusts the block_max for the current WG, only
+        # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks
+        n_blocks = min(n_blocks, n_blocks_seqlen)
+        # If we have no blocks after adjusting for seqlen deltas, this WG is
+        # part of the blocks that are all 0. We exit early.
+        if n_blocks <= 0:
+            o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om +
+                        off_h_q * stride_oh)
+            O_block_ptr = tl.make_block_ptr(
+                base=Out + o_offset,
+                shape=(seqlen_q, BLOCK_DMODEL),
+                strides=(stride_om, stride_on),
+                offsets=(start_m * BLOCK_M, 0),
+                block_shape=(BLOCK_M, BLOCK_DMODEL),
+                order=(1, 0),
+            )
+            acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty)
+            # We still need to write 0s to the result
+            # tl.store(O_block_ptr,
+            # acc.to(Out.type.element_ty), boundary_check=(0,1))
+            # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q
+            #          + offs_m
+            # We store inf to LSE, not -inf because in the bwd pass,
+            # we subtract this
+            # from qk which makes it -inf, such that exp(qk - inf) = 0
+            # for these masked blocks.
+            # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32)
+            # tl.store(l_ptrs, l)
+            # TODO: Should dropout and return encoded softmax be handled here?
+            return
+
+    # If MQA / GQA, set the K and V head offsets appropriately.
+    GROUP_SIZE: tl.constexpr = HQ // HK
+    off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q
+
+    n_extra_tokens = 0
+    if seqlen_k < BLOCK_N:
+        n_extra_tokens = BLOCK_N - seqlen_k
+    elif seqlen_k % BLOCK_N:
+        n_extra_tokens = seqlen_k % BLOCK_N
+    padded_head = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL
+
+    # Compute pointers for all the tensors used in this kernel.
+    q_offset = (off_z * stride_qz + off_h_q * stride_qh +
+                cu_seqlens_q_start * stride_qm)
+    Q_block_ptr = tl.make_block_ptr(
+        base=Q + q_offset,
+        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_qm, stride_qk),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    k_offset = (off_z * stride_kz + off_h_k * stride_kh +
+                cu_seqlens_k_start * stride_kn)
+    K_block_ptr = tl.make_block_ptr(
+        base=K + k_offset,
+        shape=(ACTUAL_BLOCK_DMODEL, seqlen_k),
+        strides=(stride_kk, stride_kn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_DMODEL, BLOCK_N),
+        order=(0, 1),
+    )
+    v_offset = (off_z * stride_vz + off_h_k * stride_vh +
+                cu_seqlens_k_start * stride_vk)
+    V_block_ptr = tl.make_block_ptr(
+        base=V + v_offset,
+        shape=(seqlen_k, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_vk, stride_vn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_N, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    if BIAS_TYPE != 0:
+        bias_ptr = tl.make_block_ptr(
+            base=bias + off_h_q * stride_bh,
+            shape=(seqlen_q, seqlen_k),
+            strides=(stride_bm, stride_bn),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_N),
+            order=(1, 0),
+        )
+    else:
+        bias_ptr = None
+    if ENABLE_DROPOUT:
+        batch_philox_offset = philox_offset_base \
+                              + (off_z * HQ + off_h_q) \
+                              * seqlen_q * seqlen_k
+    else:
+        batch_philox_offset = 0
+    # We can ask to return the dropout mask without actually doing any dropout.
+    # In this case, we return an invalid pointer so indicate the mask is not i
+    # valid.
+    # TODO: Fix encoded softmax. It currently uses just h_q in the base offset.
+    if RETURN_ENCODED_SOFTMAX:
+        encoded_softmax_block_ptr = tl.make_block_ptr(
+            base=encoded_softmax + off_h_q * seqlen_q * seqlen_k,
+            shape=(seqlen_q, seqlen_k),
+            strides=(seqlen_k, 1),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_N),
+            order=(1, 0),
+        )
+    else:
+        encoded_softmax_block_ptr = 0
+    # initialize pointer to m and l
+    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+    l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    # scale sm_scale by log_2(e) and use 2^x in the loop as we do not
+    # have native e^x support in HW.
+    qk_scale = sm_scale * 1.44269504089
+    # Q is loaded once at the beginning and shared by all N blocks.
+    q = load_fn(Q_block_ptr, True, padded_head, "zero")
+    q = (q * qk_scale).to(Q_block_ptr.type.element_ty)
+
+    # Here we compute how many full and masked blocks we have.
+    padded_block_k = n_extra_tokens != 0
+    is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0)
+    if IS_CAUSAL:
+        # There are always at least BLOCK_M // BLOCK_N masked blocks.
+        # Additionally there might be one more due to dissimilar seqlens.
+        masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn)
+    else:
+        # Padding on Q does not need to be masked in the FA loop.
+        masked_blocks = padded_block_k
+    # if IS_CAUSAL, not is_modulo_mn does not always result in an additional
+    # block. In this case we might exceed n_blocks so pick the min.
+    masked_blocks = min(masked_blocks, n_blocks)
+    n_full_blocks = n_blocks - masked_blocks
+    block_min = 0
+    block_max = n_blocks * BLOCK_N
+    # Compute for full blocks. Here we set causal to false regardless of its
+    # value because there is no masking. Similarly we do not need padding.
+    if n_full_blocks > 0:
+        block_max = (n_blocks - masked_blocks) * BLOCK_N
+        acc, l_i, m_i = _attn_fwd_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            K_block_ptr,
+            V_block_ptr,
+            start_m,
+            seqlen_k,
+            dropout_p,
+            philox_seed,
+            batch_philox_offset,
+            encoded_softmax_block_ptr,
+            # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _
+            block_min,
+            block_max,
+            0,
+            0,
+            0,
+            bias_ptr,
+            # IS_CAUSAL, ....
+            False,
+            BLOCK_M,
+            BLOCK_DMODEL,
+            BLOCK_N,
+            offs_m,
+            offs_n,
+            # _, MASK_STEPS, ...
+            PRE_LOAD_V,
+            False,
+            ENABLE_DROPOUT,
+            RETURN_ENCODED_SOFTMAX,
+            padded_head,
+        )
+        block_min = block_max
+        block_max = n_blocks * BLOCK_N
+
+    tl.debug_barrier()
+    # Remaining blocks, if any, are full / not masked.
+    if masked_blocks > 0:
+        offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0
+        K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N))
+        V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0))
+        if bias_ptr is not None:
+            bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N))
+        if RETURN_ENCODED_SOFTMAX:
+            encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr,
+                                                   (0, n_full_blocks))
+        acc, l_i, m_i = _attn_fwd_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            K_block_ptr,
+            V_block_ptr,
+            start_m,
+            seqlen_k,
+            dropout_p,
+            philox_seed,
+            batch_philox_offset,
+            encoded_softmax_block_ptr,
+            block_min,
+            block_max,
+            offs_n_causal,
+            masked_blocks,
+            n_extra_tokens,
+            bias_ptr,
+            IS_CAUSAL,
+            BLOCK_M,
+            BLOCK_DMODEL,
+            BLOCK_N,
+            offs_m,
+            offs_n,
+            # _, MASK_STEPS, ...
+            PRE_LOAD_V,
+            True,
+            ENABLE_DROPOUT,
+            RETURN_ENCODED_SOFTMAX,
+            padded_head,
+        )
+    # epilogue
+    acc = acc / l_i[:, None]
+    if ENABLE_DROPOUT:
+        acc = acc / (1 - dropout_p)
+    # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M,
+    # then we have one block with a row of all NaNs which come from computing
+    # softmax over a row of all -infs (-inf - inf = NaN). We check for that here
+    # and store 0s where there are NaNs as these rows should've been zeroed out.
+    end_m_idx = (start_m + 1) * BLOCK_M
+    start_m_idx = start_m * BLOCK_M
+    causal_start_idx = seqlen_q - seqlen_k
+    acc = acc.to(Out.type.element_ty)
+    if IS_CAUSAL:  # noqa: SIM102
+        if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx:
+            out_mask_boundary = tl.full((BLOCK_DMODEL, ),
+                                        causal_start_idx,
+                                        dtype=tl.int32)
+            mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
+            out_ptrs_mask = (mask_m_offsets[:, None] >=
+                             out_mask_boundary[None, :])
+            z = 0.0
+            acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty))
+    # write back LSE
+    # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m
+    # If seqlen_q not multiple of BLOCK_M, we need to mask out the last
+    # few rows. This is only true for the last M block. For others,
+    # overflow_size will be -ve
+    # overflow_size = end_m_idx - seqlen_q
+    # if overflow_size > 0:
+    #    boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32)
+    #    # This is a > check because mask being 0 blocks the store.
+    #    l_ptrs_mask = boundary > tl.arange(0, BLOCK_M)
+    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask)
+    # else:
+    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i))
+
+    # write back O
+    o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om +
+                off_h_q * stride_oh)
+    O_block_ptr = tl.make_block_ptr(
+        base=Out + o_offset,
+        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_om, stride_on),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    # Need boundary check on this to make sure the padding from the
+    # Q and KV tensors in both dims are not part of what we store back.
+    # TODO: Do the boundary check optionally.
+    tl.store(O_block_ptr, acc, boundary_check=(0, 1))
+
+
+def check_args(
+    q,
+    k,
+    v,
+    o,
+    varlen=True,
+    max_seqlens=None,
+    cu_seqlens_q=None,
+    cu_seqlens_k=None,
+):
+    assert q.dim() == k.dim() and q.dim() == v.dim()
+    if varlen:
+        assert q.dim() == 3
+        total_q, nheads_q, head_size = q.shape
+        total_k, nheads_k, _ = k.shape
+        assert cu_seqlens_q is not None
+        assert cu_seqlens_k is not None
+        assert len(cu_seqlens_q) == len(cu_seqlens_k)
+    else:
+        assert q.dim() == 4
+        batch, nheads_q, seqlen_q, head_size = q.shape
+        _, nheads_k, seqlen_k, _ = k.shape
+        assert max_seqlens > 0
+    assert k.shape == v.shape
+    assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
+    # TODO: Change assert if we support qkl f8 and v f16
+    assert q.dtype == k.dtype and q.dtype == v.dtype
+    assert head_size <= 256
+    assert o.shape == q.shape
+    assert (nheads_q % nheads_k) == 0
+
+
+class _attention(torch.autograd.Function):
+
+    @staticmethod
+    def forward(
+        ctx,
+        q,
+        k,
+        v,
+        o,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlens_q,
+        max_seqlens_k,
+        causal=False,
+        sm_scale=1.0,
+        bias=None,
+    ):
+        if o is None:
+            o = torch.empty_like(q, dtype=v.dtype)
+
+        check_args(
+            q,
+            k,
+            v,
+            o,
+            varlen=True,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+        )
+        if True:  # varlen
+            total_q, nheads_q, head_size = q.shape
+            total_k, nheads_k, _ = k.shape
+            batch = len(cu_seqlens_q) - 1
+            q_strides = (0, q.stride(1), q.stride(0), q.stride(2))
+            k_strides = (0, k.stride(1), k.stride(0), k.stride(2))
+            v_strides = (0, v.stride(1), v.stride(0), v.stride(2))
+            o_strides = (0, o.stride(1), o.stride(0), o.stride(2))
+        else:
+            batch, seqlen_q, nheads_q, head_size = q.shape
+            _, seqlen_k, nheads_k, _ = k.shape
+            q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3))
+            k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3))
+            v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3))
+            o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3))
+
+        # Get closest power of 2 over or equal to 32.
+        unpadded_head_dims = {32, 64, 128, 256}
+        if head_size not in unpadded_head_dims:
+            padded_d_model = None
+            for i in unpadded_head_dims:
+                if i > head_size:
+                    padded_d_model = i
+                    break
+            assert padded_d_model is not None
+        else:
+            padded_d_model = head_size
+
+        grid = lambda META: (
+            triton.cdiv(max_seqlens_q, META["BLOCK_M"]),
+            nheads_q,
+            batch,
+        )
+
+        encoded_softmax = None
+
+        # Seed the RNG so we get reproducible results for testing.
+        philox_seed = 0x1BF52
+        philox_offset = 0x1D4B42
+
+        if bias is not None:
+            bias_strides = (
+                bias.stride(0),
+                bias.stride(1),
+                bias.stride(2),
+                bias.stride(3),
+            )
+        else:
+            bias_strides = (0, 0, 0, 0)
+
+        attn_fwd[grid](
+            q,
+            k,
+            v,
+            bias,
+            sm_scale,
+            None,
+            o,
+            *q_strides,
+            *k_strides,
+            *v_strides,
+            *o_strides,
+            *bias_strides,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            dropout_p=0.0,
+            philox_seed=philox_seed,
+            philox_offset_base=philox_offset,
+            encoded_softmax=encoded_softmax,
+            HQ=nheads_q,
+            HK=nheads_k,
+            ACTUAL_BLOCK_DMODEL=head_size,
+            MAX_SEQLENS_Q=max_seqlens_q,
+            MAX_SEQLENS_K=max_seqlens_k,
+            IS_CAUSAL=causal,
+            VARLEN=True,
+            BLOCK_DMODEL=padded_d_model,
+            BIAS_TYPE=0 if bias is None else 1,
+            ENABLE_DROPOUT=False,
+            RETURN_ENCODED_SOFTMAX=False,
+        )
+
+        ctx.grid = grid
+        ctx.sm_scale = sm_scale
+        ctx.BLOCK_DMODEL = head_size
+        ctx.causal = causal
+        ctx.dropout_p = 0.0
+        ctx.philox_seed = philox_seed
+        ctx.philox_offset = philox_offset
+        ctx.encoded_softmax = encoded_softmax
+        ctx.return_encoded_softmax = False
+        return o, encoded_softmax
+
+
+triton_attention = _attention.apply