Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/kernels/attention/conftest.py
+++ b/tests/kernels/attention/conftest.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.utils.torch_utils import (
+    create_kv_caches_with_random,
+    create_kv_caches_with_random_flash,
+)
+
+
+@pytest.fixture()
+def kv_cache_factory():
+    """Expose the random KV-cache builder to tests as a fixture.
+
+    The builder itself is returned (it is not called here), so every test
+    chooses the cache geometry and dtypes at its own call site.
+    """
+    factory = create_kv_caches_with_random
+    return factory
+
+
+@pytest.fixture()
+def kv_cache_factory_flashinfer():
+    """Expose the ``_flash`` variant of the random KV-cache builder as a fixture.
+
+    As with ``kv_cache_factory``, the callable is handed back uncalled.
+    """
+    factory = create_kv_caches_with_random_flash
+    return factory
--- a/tests/kernels/attention/test_aiter_flash_attn.py
+++ b/tests/kernels/attention/test_aiter_flash_attn.py
@@ -0,0 +1,190 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+import torch
+
+import vllm.v1.attention.backends.rocm_aiter_fa  # noqa: F401
+from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
+from vllm.platforms import current_platform
+
+# (num_query_heads, num_kv_heads) pairs; the test unpacks them in that order.
+NUM_HEADS = [(4, 4), (8, 2)]
+HEAD_SIZES = [128, 256]
+BLOCK_SIZES = [16]
+DTYPES = [torch.bfloat16]
+# Dtypes the query / KV cache are cast to before the kernel call; None keeps
+# the tensors in their original dtype.
+QDTYPES = [None]
+# One value large enough to test overflow in the index calculation and
+# one value small enough to test the schema op check.
+NUM_BLOCKS = [32768, 2048]
+
+
+def ref_paged_attn(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    query_lens: list[int],
+    kv_lens: list[int],
+    block_tables: torch.Tensor,
+    scale: float,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
+) -> torch.Tensor:
+    """Pure-PyTorch reference for varlen attention over a paged KV cache.
+
+    Args:
+        query: Packed queries, indexed as (token, head, head_size); the tokens
+            of all sequences are concatenated along dim 0.
+        key_cache: Paged keys of shape
+            (num_blocks, block_size, num_kv_heads, head_size).
+        value_cache: Paged values, indexed the same way as ``key_cache``.
+        query_lens: Number of query tokens of each sequence.
+        kv_lens: Number of key/value tokens of each sequence. Entries may be
+            plain ints or 0-d integer tensors.
+        block_tables: Per-sequence rows of block indices into the caches.
+        scale: Factor applied to the queries before the dot product.
+        sliding_window: If given, keys further back than this window are
+            masked out in addition to the causal mask.
+        soft_cap: If given, logits are squashed with
+            ``soft_cap * tanh(logits / soft_cap)`` before masking.
+
+    Returns:
+        The attention outputs of all sequences concatenated along dim 0.
+        ``query`` is left unmodified.
+    """
+    num_seqs = len(query_lens)
+    block_tables = block_tables.cpu().numpy()
+    _, block_size, num_kv_heads, head_size = key_cache.shape
+
+    outputs: list[torch.Tensor] = []
+    start_idx = 0
+    for i in range(num_seqs):
+        # int() accepts plain ints as well as 0-d tensors.
+        query_len = int(query_lens[i])
+        kv_len = int(kv_lens[i])
+        # Scale out of place: the slice is a view of the caller's `query`, so
+        # an in-place `q *= scale` would silently rescale the caller's tensor.
+        q = query[start_idx : start_idx + query_len] * scale
+
+        # Gather this sequence's blocks and flatten them into a token axis.
+        num_kv_blocks = (kv_len + block_size - 1) // block_size
+        block_indices = block_tables[i, :num_kv_blocks]
+
+        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
+        k = k[:kv_len]
+        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
+        v = v[:kv_len]
+
+        if q.shape[1] != k.shape[1]:
+            # Grouped-query attention: repeat each KV head for its query heads.
+            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
+            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
+        attn = torch.einsum("qhd,khd->hqk", q, k).float()
+        # Build the mask on the same device as the scores instead of relying
+        # on the process-wide default device.
+        empty_mask = torch.ones(query_len, kv_len, device=query.device)
+        # Causal mask: the queries are the last `query_len` positions of the
+        # `kv_len` keys, hence the diagonal offset.
+        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
+        if sliding_window is not None:
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
+            mask |= sliding_window_mask
+        if soft_cap is not None:
+            attn = soft_cap * torch.tanh(attn / soft_cap)
+        attn.masked_fill_(mask, float("-inf"))
+        attn = torch.softmax(attn, dim=-1).to(v.dtype)
+        out = torch.einsum("hqk,khd->qhd", attn, v)
+
+        outputs.append(out)
+        start_idx += query_len
+
+    return torch.cat(outputs, dim=0)
+
+
+@pytest.mark.skipif(not current_platform.is_rocm(), reason="Only ROCm is supported")
+@pytest.mark.parametrize(
+    "seq_lens", [[(10, 1328), (5, 18), (129, 463)], [(8, 523), (24, 37), (3, 2011)]]
+)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("sliding_window", [None, 256])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("q_dtype", QDTYPES)
+@torch.inference_mode()
+def test_varlen_with_paged_kv(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    q_dtype: torch.dtype | None,
+) -> None:
+    """Compare ``torch.ops.vllm.flash_attn_varlen_func`` with ``ref_paged_attn``.
+
+    Args:
+        seq_lens: One ``(query_len, kv_len)`` pair per sequence.
+        num_heads: ``(num_query_heads, num_kv_heads)``.
+        head_size: Size of each attention head.
+        sliding_window: Window length, or None to disable windowing.
+        dtype: Dtype of the randomly generated query and KV cache.
+        block_size: Number of tokens per KV-cache block.
+        soft_cap: Forwarded to the reference only; the kernel call below does
+            not take it, so only None is parametrized.
+        num_blocks: Number of blocks in the KV cache.
+        q_dtype: If not None, query and caches are cast to this dtype for the
+            kernel call and unit descale factors are supplied.
+    """
+    if not is_flash_attn_varlen_func_available():
+        pytest.skip("flash_attn_varlen_func required to run this test.")
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_query_len = max(query_lens)
+    max_kv_len = max(kv_lens)
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
+    scale = head_size**-0.5
+
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
+    key_cache = torch.randn(
+        num_blocks, block_size, num_kv_heads, head_size, dtype=dtype
+    )
+    value_cache = torch.randn_like(key_cache)
+    # Cumulative lengths with a leading zero, as consumed by the kernel.
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
+    cu_seq_lens = torch.tensor([0] + kv_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
+
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    output = torch.empty_like(query)
+
+    maybe_quantized_query = query
+    maybe_quantized_key_cache = key_cache
+    maybe_quantized_value_cache = value_cache
+    k_descale = None
+    v_descale = None
+    if q_dtype is not None:
+        # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
+        maybe_quantized_query = query.to(q_dtype)
+        maybe_quantized_key_cache = key_cache.to(q_dtype)
+        maybe_quantized_value_cache = value_cache.to(q_dtype)
+
+        scale_shape = (num_seqs, num_kv_heads)
+        k_descale = torch.ones(scale_shape, dtype=torch.float32)
+        v_descale = torch.ones(scale_shape, dtype=torch.float32)
+
+    torch.ops.vllm.flash_attn_varlen_func(
+        maybe_quantized_query,
+        maybe_quantized_key_cache,
+        maybe_quantized_value_cache,
+        out=output,
+        cu_seqlens_q=cu_query_lens,
+        max_seqlen_q=max_query_len,
+        max_seqlen_k=max_kv_len,
+        softmax_scale=scale,
+        alibi_slopes=None,
+        window_size=window_size,
+        block_table=block_tables,
+        cu_seqlens_k=cu_seq_lens,
+        k_scale=k_descale,
+        v_scale=v_descale,
+    )
+
+    # The reference takes the per-sequence lengths as plain ints.
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        sliding_window=sliding_window,
+        soft_cap=soft_cap,
+    )
+
+    atol, rtol = 2e-2, 2e-2
+    if q_dtype is not None:
+        atol, rtol = 1.5e-1, 1.5e-1
+    # Pass the diagnostic through `msg` so it is actually reported on failure;
+    # the callable form appends to torch's generated message and is only
+    # evaluated when the comparison fails.
+    torch.testing.assert_close(
+        output,
+        ref_output,
+        atol=atol,
+        rtol=rtol,
+        msg=lambda default_msg: (
+            f"{default_msg}\n"
+            f"max abs diff: {torch.max(torch.abs(output - ref_output))}"
+        ),
+    )
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -0,0 +1,457 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import random
+
+import pytest
+import torch
+
+from tests.kernels.allclose_default import get_default_atol, get_default_rtol
+from tests.kernels.utils import opcheck
+from vllm import _custom_ops as ops
+from vllm.attention.layer import Attention, MultiHeadAttention
+from vllm.platforms import current_platform
+from vllm.utils.mem_utils import get_max_shared_memory_bytes
+
+FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
+# Longest sequence exercised by the tests: the number of float32 values that
+# fit into the maximum shared memory (this changes with the compute
+# capability), minus 512 as a buffer.
+MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
+# There may not be enough GPU memory due to a large NUM_BLOCKS.
+# Reduce NUM_BLOCKS when that happens.
+NUM_BLOCKS = 4321  # Arbitrary values for testing
+PARTITION_SIZE = 512
+PARTITION_SIZE_ROCM = 256
+DTYPES = [torch.bfloat16]
+NUM_GEN_SEQS = [7]  # Arbitrary values for testing
+NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
+# (num_query_heads, num_kv_heads) pairs.
+NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
+
+# This should be kept in sync with get_supported_head_sizes() in
+# vllm.attention.ops.paged_attn.PagedAttention
+HEAD_SIZES = [32, 80, 128, 256]
+
+BLOCK_SIZES = [16, 32]
+USE_ALIBI = [False, True]
+KV_CACHE_DTYPE = ["auto", "fp8"]
+SEEDS = [0]
+# "cuda:0" only when exactly one device is reported; otherwise "cuda:0" and
+# "cuda:1" (this also covers a reported device count of 0).
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+
+
+def ref_masked_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    scale: float,
+    attn_mask: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """Reference scaled dot-product attention with an optional additive mask.
+
+    ``query`` is indexed as (q, head, dim) and ``key`` / ``value`` as
+    (k, head, dim). The logits are computed in float32, ``attn_mask`` (if any)
+    is added to them, and the softmax weights are cast to ``value.dtype``
+    before being applied. The result is indexed as (q, head, dim).
+    """
+    logits = torch.einsum("qhd,khd->hqk", query, key).float()
+    logits = scale * logits
+    if attn_mask is not None:
+        logits = logits + attn_mask.float()
+    probs = logits.softmax(dim=-1).to(value.dtype)
+    return torch.einsum("hqk,khd->qhd", probs, value)
+
+
+def ref_single_query_cached_kv_attention(
+    output: torch.Tensor,
+    query: torch.Tensor,
+    num_queries_per_kv: int,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    block_tables: torch.Tensor,
+    seq_lens: torch.Tensor,
+    scale: float,
+    alibi_slopes: torch.Tensor | None,
+) -> None:
+    """Reference decode attention: one query token per sequence, paged KV.
+
+    For every sequence the keys and values are gathered token by token from
+    the paged caches via ``block_tables``, expanded for MQA/GQA when
+    ``num_queries_per_kv > 1``, and fed to ``ref_masked_attention``. The
+    result for sequence ``i`` is copied into ``output[i]``; nothing is
+    returned.
+    """
+    num_seqs, num_query_heads = query.shape[0], query.shape[1]
+    num_kv_heads, head_size, block_size = (
+        value_cache.shape[1],
+        value_cache.shape[2],
+        value_cache.shape[3],
+    )
+
+    tables = block_tables.cpu().tolist()
+    lengths = seq_lens.cpu().tolist()
+    for seq_idx in range(num_seqs):
+        table = tables[seq_idx]
+        seq_len = int(lengths[seq_idx])
+
+        # (block number, offset inside the block) of every token position.
+        slots = [
+            (int(table[pos // block_size]), pos % block_size)
+            for pos in range(seq_len)
+        ]
+        keys = torch.stack(
+            [
+                key_cache[blk, :, :, off, :].reshape(num_kv_heads, head_size)
+                for blk, off in slots
+            ],
+            dim=0,
+        )
+        values = torch.stack(
+            [value_cache[blk, :, :, off] for blk, off in slots], dim=0
+        )
+        if num_queries_per_kv > 1:
+            # Handle MQA and GQA
+            keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
+            values = torch.repeat_interleave(values, num_queries_per_kv, dim=1)
+
+        if alibi_slopes is None:
+            alibi_bias = None
+        else:
+            # Create the ALiBi bias used in the paged attention kernel:
+            # per-head slope times the (non-positive) distance to the last
+            # position.
+            distances = (torch.arange(seq_len).int() - seq_len + 1).float()
+            alibi_bias = alibi_slopes.view(-1, 1, 1) * distances.view(1, 1, -1)
+
+        single_query = query[seq_idx].unsqueeze(0)
+        attended = ref_masked_attention(
+            single_query, keys, values, scale, alibi_bias
+        )
+        output[seq_idx].copy_(
+            attended.view(num_query_heads, head_size), non_blocking=True
+        )
+
+
+@pytest.mark.parametrize(
+    "version", ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"]
+)
+@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("use_alibi", USE_ALIBI)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_paged_attention(
+    kv_cache_factory,
+    version: str,
+    num_seqs: int,
+    num_heads: tuple[int, int],
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    seed: int,
+    device: str,
+) -> None:
+    """Check the paged-attention kernels against a pure-PyTorch reference.
+
+    Args:
+        kv_cache_factory: Fixture returning the random KV-cache builder.
+        version: Kernel to run: "v1", "v2" or "rocm".
+        num_seqs: Number of sequences (one query token each).
+        num_heads: ``(num_query_heads, num_kv_heads)``.
+        head_size: Size of each attention head.
+        use_alibi: Whether random ALiBi slopes are passed to the kernel.
+        block_size: Number of tokens per KV-cache block.
+        dtype: Dtype of the query and of the reference computation.
+        kv_cache_dtype: KV-cache dtype string ("auto" or "fp8").
+        seed: Seed for the random inputs.
+        device: Device string the tensors are created on.
+    """
+    if (kv_cache_dtype == "fp8" and head_size % 16) or (
+        version == "rocm" and head_size not in (64, 128)
+    ):
+        pytest.skip()
+
+    if (
+        version == "rocm"
+        and current_platform.is_navi()
+        and (
+            kv_cache_dtype == "fp8" or head_size != 128 or block_size != 16 or use_alibi
+        )
+    ):
+        pytest.skip()
+
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+    scale = float(1.0 / (head_size**0.5))
+    num_query_heads, num_kv_heads = num_heads
+    query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
+    query.uniform_(-scale, scale)
+
+    assert num_query_heads % num_kv_heads == 0
+    num_queries_per_kv = num_query_heads // num_kv_heads
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
+
+    # Random lengths, with the last sequence pinned to the maximum.
+    seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
+    seq_lens[-1] = MAX_SEQ_LEN
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int)
+
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables_lst: list[list[int]] = [
+        [random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq)]
+        for _ in range(num_seqs)
+    ]
+    block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
+
+    # Create the KV caches.
+    key_caches, value_caches = kv_cache_factory(
+        NUM_BLOCKS,
+        block_size,
+        1,
+        num_kv_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        seed,
+        device,
+    )
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # Using default kv_scale
+    k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
+
+    # Positional arguments shared by every kernel variant. They are split in
+    # two because the ROCm op takes one extra argument between them.
+    head_args = (
+        query,
+        key_cache,
+        value_cache,
+        num_kv_heads,
+        scale,
+        block_tables,
+        seq_lens,
+    )
+    tail_args = (
+        block_size,
+        max_seq_len,
+        alibi_slopes,
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+    )
+    # Extra trailing arguments that only the raw `_C` ops receive in opcheck.
+    # NOTE(review): presumably the defaults the Python wrappers fill in;
+    # confirm against the op schema.
+    opcheck_extra = (0, 0, 0, 64, 0)
+    # opcheck is only run for one (head_size, block_size) combination.
+    run_opcheck = head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]
+
+    # Call the paged attention kernel.
+    output = torch.empty_like(query)
+    if version == "v1":
+        v1_args = (output, *head_args, *tail_args)
+        ops.paged_attention_v1(*v1_args)
+
+        opcheck(
+            torch.ops._C.paged_attention_v1,
+            (*v1_args, *opcheck_extra),
+            cond=run_opcheck,
+        )
+
+    elif version in ("v2", "rocm"):
+        # Use a local instead of rebinding the module-level PARTITION_SIZE:
+        # mutating the global leaked the ROCm value into every parametrized
+        # case that ran afterwards.
+        if current_platform.is_rocm() and version == "rocm":
+            partition_size = PARTITION_SIZE_ROCM
+        else:
+            partition_size = PARTITION_SIZE
+
+        num_partitions = (max_seq_len + partition_size - 1) // partition_size
+        assert partition_size % block_size == 0
+        out_seqs, out_heads, out_head_size = output.shape
+        # Per-partition scratch buffers handed to the kernel.
+        tmp_output = torch.empty(
+            size=(out_seqs, out_heads, num_partitions, out_head_size),
+            dtype=output.dtype,
+        )
+        exp_sums = torch.empty(
+            size=(out_seqs, out_heads, num_partitions),
+            dtype=torch.float32,
+        )
+        max_logits = torch.empty_like(exp_sums)
+        partition_bufs = (output, exp_sums, max_logits, tmp_output)
+        if version == "v2":
+            v2_args = (*partition_bufs, *head_args, *tail_args)
+            ops.paged_attention_v2(*v2_args)
+
+            opcheck(
+                torch.ops._C.paged_attention_v2,
+                (*v2_args, *opcheck_extra),
+                cond=run_opcheck,
+            )
+
+        else:
+            # The ROCm op takes an additional argument (None here) right
+            # after seq_lens.
+            rocm_args = (*partition_bufs, *head_args, None, *tail_args)
+            ops.paged_attention_rocm(*rocm_args)
+
+            opcheck(
+                torch.ops._rocm_C.paged_attention,
+                rocm_args,
+                cond=run_opcheck,
+            )
+
+    else:
+        raise AssertionError(f"Unknown version: {version}")
+
+    # Run the reference implementation.
+    if kv_cache_dtype == "fp8":
+        # Convert cache data back to dtype.
+        x = 16 // torch.tensor([], dtype=dtype).element_size()
+        key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x)
+        dequantized_key_cache = torch.empty(
+            size=key_cache_shape, dtype=dtype, device=device
+        )
+        ops.convert_fp8(dequantized_key_cache, key_cache)
+        key_cache = dequantized_key_cache
+
+        value_cache_shape = value_cache.shape
+        dequantized_value_cache = torch.empty(
+            size=value_cache_shape, dtype=dtype, device=device
+        )
+        ops.convert_fp8(dequantized_value_cache, value_cache)
+        value_cache = dequantized_value_cache
+
+    ref_output = torch.empty_like(query)
+    ref_single_query_cached_kv_attention(
+        ref_output,
+        query,
+        num_queries_per_kv,
+        key_cache,
+        value_cache,
+        block_tables,
+        seq_lens,
+        scale,
+        alibi_slopes,
+    )
+
+    # NOTE(woosuk): Due to the kernel-level differences in the two
+    # implementations, there is a small numerical difference in the two
+    # outputs. Thus, we use a relaxed tolerance for the test.
+    # On ROCm the dtype-specific defaults are used. They were previously
+    # computed and then unconditionally overwritten, so they never applied.
+    if current_platform.is_rocm():
+        atol, rtol = get_default_atol(output), get_default_rtol(output)
+    else:
+        atol, rtol = 1e-3, 1e-5
+
+    # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
+    # so we use a relaxed absolute tolerance for the test.
+    if kv_cache_dtype == "fp8":
+        atol = max(atol, 1e-2)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
+
+
+def ref_multi_query_kv_attention(
+    cu_seq_lens: list[int],
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    scale: float,
+    alibi_bias: list[torch.Tensor] | None,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Reference prefill attention over sequences packed along dim 0.
+
+    ``cu_seq_lens`` holds cumulative offsets, so sequence ``i`` occupies
+    ``[cu_seq_lens[i], cu_seq_lens[i + 1])`` in ``query`` / ``key`` / ``value``.
+    Each sequence is attended independently with ``ref_masked_attention`` and
+    the per-sequence results are concatenated along dim 0.
+    """
+    num_seqs = len(cu_seq_lens) - 1
+    if alibi_bias:
+        assert len(alibi_bias) == num_seqs
+
+    def causal_mask(seq_len: int) -> torch.Tensor:
+        # Strict upper triangle filled with the most negative finite value.
+        upper = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), diagonal=1)
+        return (upper * torch.finfo(dtype).min).to(dtype=dtype)
+
+    per_seq_outputs: list[torch.Tensor] = []
+    for i, (start, end) in enumerate(zip(cu_seq_lens[:-1], cu_seq_lens[1:])):
+        # Create attention mask. ALiBi already includes a tril causal mask.
+        mask = alibi_bias[i] if alibi_bias else causal_mask(end - start)
+        per_seq_outputs.append(
+            ref_masked_attention(
+                query[start:end],
+                key[start:end],
+                value[start:end],
+                scale,
+                attn_mask=mask,
+            )
+        )
+
+    return torch.cat(per_seq_outputs, dim=0)
+
+
+@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention])
+def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
+    """Constructing a layer with 16 heads and 5 KV heads must assert."""
+    head_size = 64
+    layer_kwargs = dict(
+        num_heads=16,
+        head_size=head_size,
+        scale=float(1.0 / (head_size**0.5)),
+        num_kv_heads=5,
+    )
+    with pytest.raises(AssertionError):
+        attention_cls(**layer_kwargs)
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -0,0 +1,291 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
+from vllm.platforms import current_platform
+from vllm.platforms.cpu import CpuPlatform
+from vllm.platforms.cuda import CudaPlatform
+from vllm.platforms.rocm import RocmPlatform
+
+
+@pytest.fixture(autouse=True)
+def clear_cache():
+    """Reset the backend-selection LRU cache before every test in this module.
+
+    Without this, a backend chosen in one test case could be served from the
+    cache in the next one.
+    """
+    reset_selection_cache = _cached_get_attn_backend.cache_clear
+    reset_selection_cache()
+
+
+# Define MLA and non-MLA backends separately.
+# All three tables are keyed by device kind: "cuda", "hip" (ROCm) or "cpu".
+DEVICE_MLA_BACKENDS = {
+    "cuda": [
+        "TRITON_MLA",
+        "FLASHMLA",
+        "FLASHINFER_MLA",
+        "FLASH_ATTN_MLA",
+        "CUTLASS_MLA",
+    ],
+    "hip": ["TRITON_MLA", "ROCM_AITER_MLA"],
+    "cpu": [],
+}
+
+DEVICE_REGULAR_ATTN_BACKENDS = {
+    "cuda": ["FLASHINFER", "FLASH_ATTN"],
+    "hip": ["ROCM_ATTN"],
+    "cpu": ["CPU_ATTN"],
+}
+
+# Block sizes combined with every MLA backend by generate_params(); non-MLA
+# cases always use a block size of 16.
+# NOTE(review): test_env skips CUTLASS_MLA unless block_size == 128, which is
+# not listed for "cuda", so those cases are always skipped.
+DEVICE_MLA_BLOCK_SIZES = {
+    "cuda": [16, 64],  # CUDA supports both standard and extended block sizes
+    "hip": [16, 1],  # HIP requires special handling for block_size=1
+    # "cpu": [16]  # CPU uses fixed block size from test cases
+    "cpu": [],  # FIXME(woosuk): Temporarily disable CPU tests
+}
+
+
+def generate_params():
+    """Build the ``(device, name, use_mla, block_size)`` cases for ``test_env``.
+
+    The accelerator device is "hip" on ROCm and "cuda" otherwise; "cpu" is
+    always included. MLA cases come first, and within each group the order is
+    device, then backend name, then block size.
+    """
+    accelerator = "hip" if current_platform.is_rocm() else "cuda"
+    params = []
+    for use_mla in (True, False):
+        backend_table = (
+            DEVICE_MLA_BACKENDS if use_mla else DEVICE_REGULAR_ATTN_BACKENDS
+        )
+        for device in (accelerator, "cpu"):
+            params.extend(
+                pytest.param(
+                    device,
+                    name,
+                    use_mla,
+                    block_size,
+                    id=f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}",
+                )
+                for name in backend_table[device]
+                for block_size in (
+                    DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [16]
+                )
+            )
+    return params
+
+
+@pytest.mark.parametrize("device, name, use_mla, block_size", generate_params())
+def test_env(
+    device: str,
+    name: str,
+    use_mla: bool,
+    block_size: int,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Check that the backend forced via VLLM_ATTENTION_BACKEND is selected.
+
+    The platform is patched to match ``device``. Combinations a backend does
+    not support are either skipped (CUDA) or expected to raise ``ValueError``
+    (TRITON_MLA with block_size 1 on ROCm).
+    """
+
+    def select(head_size: int):
+        # Every accelerator case uses float16, no explicit kv-cache dtype and
+        # the parametrized block size; only the head size differs.
+        return get_attn_backend(
+            head_size, torch.float16, None, block_size, use_mla=use_mla
+        )
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", name)
+        # NOTE(review): this sets VLLM_MLA_DISABLE to "1" exactly when use_mla
+        # is True, which reads inverted; confirm this is intended.
+        m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
+
+        if device == "cpu":
+            with patch("vllm.platforms.current_platform", CpuPlatform()):
+                backend = get_attn_backend(16, torch.float16, None, block_size)
+            assert backend.get_name() == "CPU_ATTN"
+            return
+
+        if device == "hip":
+            with patch("vllm.platforms.current_platform", RocmPlatform()):
+                if not use_mla:
+                    assert select(16).get_name() == "ROCM_ATTN"
+                elif name == "TRITON_MLA" and block_size == 1:
+                    # ROCm MLA backend logic:
+                    # - TRITON_MLA: supported when block_size != 1
+                    # - ROCM_AITER_MLA: supported when block_size == 1
+                    # A forced backend that does not match the block size
+                    # should raise ValueError.
+                    with pytest.raises(ValueError) as exc_info:
+                        select(16)
+                    assert f"The selected backend, {name}" in str(exc_info.value)
+                else:
+                    # Valid backend-block_size combination
+                    assert select(16).get_name() == name
+            return
+
+        if device != "cuda":
+            return
+
+        with patch("vllm.platforms.current_platform", CudaPlatform()):
+            capability = torch.cuda.get_device_capability()
+
+            if not use_mla:
+                # Head size used for each regular backend.
+                regular_head_sizes = {"FLASHINFER": 64, "FLASH_ATTN": 32}
+                if name in regular_head_sizes:
+                    assert select(regular_head_sizes[name]).get_name() == name
+                return
+
+            # CUDA MLA backend logic:
+            # - CUTLASS_MLA: only supported with block_size == 128
+            #   and Blackwell GPUs (SM 10.x), V1 only
+            # - FLASHINFER_MLA: only supported on Blackwell GPUs
+            #   (SM 10.x), V1 only
+            # - FLASHMLA: only supported with block_size == 64
+            # - FLASH_ATTN_MLA: V1 only
+            # - TRITON_MLA: fallback for other cases
+            expected = name
+            if name == "CUTLASS_MLA":
+                if block_size != 128:
+                    pytest.skip("CUTLASS_MLA only supports block_size 128")
+                if capability[0] != 10:
+                    pytest.skip("CUTLASS MLA is not supported on this platform")
+            elif name == "FLASHINFER_MLA":
+                if capability[0] != 10:
+                    pytest.skip("FlashInfer MLA is not supported on this platform")
+                if block_size not in [32, 64]:
+                    pytest.skip("FlashInfer MLA only supports block_size 32 or 64")
+            elif name == "FLASHMLA":
+                if block_size != 64:
+                    pytest.skip("FlashMLA only supports block_size 64")
+                from vllm.v1.attention.backends.mla.flashmla import (
+                    is_flashmla_dense_supported,
+                )
+
+                is_supported, _ = is_flashmla_dense_supported()
+                if not is_supported:
+                    pytest.skip("FlashMLA not supported on this platform")
+            elif name == "FLASH_ATTN_MLA":
+                from vllm.attention.utils.fa_utils import (
+                    flash_attn_supports_mla,
+                )
+
+                if not flash_attn_supports_mla():
+                    pytest.skip("FlashAttention MLA not supported on this platform")
+            else:
+                # TRITON_MLA or other fallback
+                expected = "TRITON_MLA"
+
+            assert select(576).get_name() == expected
+
+
+@pytest.mark.parametrize("device", ["cpu", "cuda"])
+def test_fp32_fallback(device: str):
+    """Check which backend is selected for float32 on CPU and on CUDA."""
+    # device -> (platform class to patch in, expected backend name)
+    cases = {
+        "cpu": (CpuPlatform, "CPU_ATTN"),
+        "cuda": (CudaPlatform, "FLEX_ATTENTION"),
+    }
+    if device not in cases:
+        return
+
+    platform_cls, expected_name = cases[device]
+    with patch("vllm.platforms.current_platform", platform_cls()):
+        backend = get_attn_backend(16, torch.float32, None, 16)
+    assert backend.get_name() == expected_name
+
+
+def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
+    """Test FlashAttn validation.
+
+    Currently disabled: the unconditional ``pytest.skip`` below means none of
+    the checks after it are executed. They are kept as a record of the
+    conditions under which FLASH_ATTN is expected not to be selected.
+    """
+    pytest.skip(
+        "Skipping as current backend selector does not "
+        "handle fallbacks when a backend is set via env var."
+    )
+
+    # --- Everything below is unreachable while the skip above is in place ---
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
+
+        # Unsupported CUDA arch
+        # NOTE: this patch goes through the outer `monkeypatch`, not `m`.
+        monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5))
+        backend = get_attn_backend(16, torch.float16, None, 16)
+        assert backend.get_name() != "FLASH_ATTN"
+
+        # Reset the monkeypatch for subsequent tests
+        # NOTE(review): undo() is called on the outer `monkeypatch`; confirm
+        # whether it also reverts the env var set through `m` above.
+        monkeypatch.undo()
+
+        # Unsupported data type
+        backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16)
+        assert backend.get_name() != "FLASH_ATTN"
+
+        # Unsupported kv cache data type
+        backend = get_attn_backend(16, torch.float16, "fp8", 16)
+        assert backend.get_name() != "FLASH_ATTN"
+
+        # Unsupported block size
+        backend = get_attn_backend(16, torch.float16, None, 8)
+        assert backend.get_name() != "FLASH_ATTN"
+
+        # flash-attn is not installed
+        import sys
+
+        # A None entry in sys.modules makes `import vllm_flash_attn` fail.
+        original_module = sys.modules.get("vllm_flash_attn")
+        monkeypatch.setitem(sys.modules, "vllm_flash_attn", None)
+        backend = get_attn_backend(16, torch.float16, None, 16)
+        assert backend.get_name() != "FLASH_ATTN"
+
+        # Restore the original module if it existed
+        if original_module is not None:
+            monkeypatch.setitem(sys.modules, "vllm_flash_attn", original_module)
+        else:
+            monkeypatch.delitem(sys.modules, "vllm_flash_attn", raising=False)
+
+        # Unsupported head size
+        backend = get_attn_backend(17, torch.float16, None, 16)
+        assert backend.get_name() != "FLASH_ATTN"
+
+
+def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
+    """An unknown VLLM_ATTENTION_BACKEND value must be rejected with ValueError."""
+    with monkeypatch.context() as m:
+        with patch("vllm.platforms.current_platform", CudaPlatform()):
+            m.setenv("VLLM_ATTENTION_BACKEND", "INVALID")
+
+            # Selection itself is expected to fail; the error text must name
+            # the offending value.
+            with pytest.raises(ValueError) as exc_info:
+                get_attn_backend(32, torch.float16, None, 16)
+            error_text = str(exc_info.value)
+            assert "Invalid value 'INVALID'" in error_text
--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
--- a/tests/kernels/attention/test_cascade_flash_attn.py
+++ b/tests/kernels/attention/test_cascade_flash_attn.py
@@ -0,0 +1,186 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states
+
+try:
+    from vllm.vllm_flash_attn import (
+        fa_version_unsupported_reason,
+        flash_attn_varlen_func,
+        is_fa_version_supported,
+    )
+except ImportError:
+    if current_platform.is_rocm():
+        pytest.skip(
+            "vllm_flash_attn is not supported for vLLM on ROCm.",
+            allow_module_level=True,
+        )
+
+# Parametrization grids for the cascade-attention tests in this module.
+# Each NUM_HEADS entry is (num_query_heads, num_kv_heads); the tests assert
+# that the first is a multiple of the second.
+NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
+HEAD_SIZES = [128, 192, 256]
+BLOCK_SIZES = [16]
+DTYPES = [torch.float16, torch.bfloat16]
+
+
+@pytest.mark.parametrize("num_tokens", [1, 39, 16912])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode()
+def test_merge_kernel(
+    num_tokens: int,
+    num_heads: tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+):
+    """Compare `merge_attn_states` with a plain PyTorch LSE-weighted merge.
+
+    Two random partial attention outputs and their log-sum-exp values are
+    merged by the kernel; the reference recombines them with softmax-style
+    weights derived from the LSEs.
+    """
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(0)
+    num_query_heads, num_kv_heads = num_heads
+    assert num_query_heads % num_kv_heads == 0
+
+    # Random inputs (draw order matters because the RNG is seeded).
+    attn_shape = (num_tokens, num_query_heads, head_size)
+    lse_shape = (num_query_heads, num_tokens)
+    prefix_output = torch.randn(attn_shape, dtype=dtype)
+    suffix_output = torch.randn(attn_shape, dtype=dtype)
+    prefix_lse = torch.randn(lse_shape, dtype=torch.float32)
+    suffix_lse = torch.randn(lse_shape, dtype=torch.float32)
+
+    # Kernel under test writes into a preallocated buffer.
+    output = torch.empty(attn_shape, dtype=dtype)
+    merge_attn_states(output, prefix_output, prefix_lse, suffix_output, suffix_lse)
+
+    # Reference: subtract the running max before exponentiating, then
+    # normalise the two weights so they sum to one.
+    lse_max = torch.maximum(prefix_lse, suffix_lse)
+    prefix_weight = torch.exp(prefix_lse - lse_max)
+    suffix_weight = torch.exp(suffix_lse - lse_max)
+    weight_total = prefix_weight + suffix_weight
+    # (heads, tokens) -> (tokens, heads, 1) so the weights broadcast over head_size.
+    prefix_scale = (prefix_weight / weight_total).transpose(0, 1).unsqueeze(2)
+    suffix_scale = (suffix_weight / weight_total).transpose(0, 1).unsqueeze(2)
+    expected = prefix_scale * prefix_output + suffix_scale * suffix_output
+    expected = expected.to(dtype)
+
+    torch.testing.assert_close(output, expected, atol=1e-2, rtol=1e-2)
+
+
+# Each case is ([(query_len, kv_len), ...], common_prefix_len): one
+# (query_len, kv_len) pair per sequence plus the number of leading KV tokens
+# that all sequences share. test_cascade asserts the prefix length is a
+# positive multiple of the block size and shorter than every kv_len.
+CASES = [
+    # Case 1. A general case.
+    ([(129, 871), (18, 280), (37, 988), (1023, 2304), (1, 257)], 256),
+    # Case 2. Flash-decoding case.
+    ([(1, 1023), (1, 879), (1, 778), (1, 1777)] * 100, 512),
+]
+
+
+@pytest.mark.parametrize("seq_lens_and_common_prefix", CASES)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("soft_cap", [None, 50])
+@pytest.mark.parametrize("num_blocks", [2048])
+@pytest.mark.parametrize("fa_version", [2, 3])
+@torch.inference_mode()
+def test_cascade(
+    seq_lens_and_common_prefix: tuple[list[tuple[int, int]], int],
+    num_heads: tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    fa_version: int,
+) -> None:
+    """Check `cascade_attention` against one regular varlen flash-attention call.
+
+    A random paged KV cache is built in which every sequence's block table
+    starts with the same `common_prefix_len // block_size` blocks. The output
+    of `flash_attn_varlen_func` over the full KV is used as the reference for
+    `cascade_attention`, which is given the shared prefix and per-sequence
+    suffix lengths separately. The test is skipped when the requested
+    flash-attention version is reported as unsupported.
+    """
+    torch.set_default_device("cuda")
+    if not is_fa_version_supported(fa_version):
+        pytest.skip(
+            f"Flash attention version {fa_version} not supported due "
+            f'to: "{fa_version_unsupported_reason(fa_version)}"'
+        )
+
+    current_platform.seed_everything(0)
+
+    # (-1, -1) is passed through as window_size / sliding_window below;
+    # presumably it means "no sliding window" — TODO confirm.
+    window_size = (-1, -1)
+    scale = head_size**-0.5
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    key_cache = torch.randn(
+        num_blocks, block_size, num_kv_heads, head_size, dtype=dtype
+    )
+    value_cache = torch.randn_like(key_cache)
+
+    seq_lens, common_prefix_len = seq_lens_and_common_prefix
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    max_query_len = max(query_lens)
+    max_kv_len = max(kv_lens)
+
+    total_num_query_tokens = sum(query_lens)
+    query = torch.randn(total_num_query_tokens, num_query_heads, head_size, dtype=dtype)
+    # Cumulative query offsets with a leading 0: [0, len0, len0 + len1, ...].
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
+    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
+    # Ceil-divide so the longest sequence fits in its block-table row.
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    assert common_prefix_len > 0
+    assert common_prefix_len % block_size == 0
+    num_common_kv_blocks = common_prefix_len // block_size
+    # Make sure the first `num_common_kv_blocks` blocks are the same.
+    block_tables[:, :num_common_kv_blocks] = block_tables[0, :num_common_kv_blocks]
+
+    # Run the regular attention.
+    ref_output = flash_attn_varlen_func(
+        q=query,
+        k=key_cache,
+        v=value_cache,
+        cu_seqlens_q=cu_query_lens,
+        seqused_k=kv_lens_tensor,
+        max_seqlen_q=max_query_len,
+        max_seqlen_k=max_kv_len,
+        softmax_scale=scale,
+        causal=True,
+        window_size=window_size,
+        block_table=block_tables,
+        softcap=soft_cap if soft_cap is not None else 0,
+    )
+
+    # Run cascade attention.
+    assert all(common_prefix_len < kv_len for kv_len in kv_lens)
+    # The shared prefix is described as a single query segment spanning all
+    # query tokens, with one KV length equal to the common prefix.
+    cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], dtype=torch.int32)
+    prefix_kv_lens = torch.tensor([common_prefix_len], dtype=torch.int32)
+    # Per-sequence KV tokens that remain after the shared prefix.
+    suffix_kv_lens = kv_lens_tensor - common_prefix_len
+    output = torch.empty_like(query)
+    cascade_attention(
+        output=output,
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        cu_query_lens=cu_query_lens,
+        max_query_len=max_query_len,
+        cu_prefix_query_lens=cu_prefix_query_lens,
+        prefix_kv_lens=prefix_kv_lens,
+        suffix_kv_lens=suffix_kv_lens,
+        max_kv_len=max_kv_len,
+        softmax_scale=scale,
+        alibi_slopes=None,
+        sliding_window=window_size,
+        logits_soft_cap=soft_cap if soft_cap is not None else 0,
+        block_table=block_tables,
+        common_prefix_len=common_prefix_len,
+        max_num_splits=0,  # no max
+        fa_version=fa_version,
+    )
+
+    # Compare the results.
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/attention/test_cpu_attn.py
+++ b/tests/kernels/attention/test_cpu_attn.py
@@ -0,0 +1,628 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import functools
+import math
+
+import pytest
+import torch
+
+from vllm.platforms import CpuArchEnum, current_platform
+from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
+
+if not current_platform.is_cpu():
+    pytest.skip("skipping CPU-only tests", allow_module_level=True)
+
+from vllm._custom_ops import (
+    cpu_attention_with_kv_cache,
+    cpu_attn_get_scheduler_metadata,
+    cpu_attn_reshape_and_cache,
+)
+
+# Parametrization grids for the CPU attention tests in this module.
+# Each NUM_HEADS entry is (num_query_heads, num_kv_heads); the shared driver
+# asserts that the first is a multiple of the second.
+NUM_HEADS = [
+    (4, 4),
+    (8, 2),
+    (9, 3),
+]
+HEAD_SIZES = [96, 128]
+# Query/KV dtypes exercised by the tests that parametrize over QTYPES.
+QTYPES = [torch.bfloat16, torch.half, torch.float32]
+# None disables the sliding-window mask in the reference implementation.
+SLIDING_WINDOWS = [None, 256]
+NUM_BLOCKS = [
+    1024,
+]
+# Each entry is one batch: a list of per-sequence (q_len, kv_len) pairs.
+SEQ_LENS = [  # (q_len, kv_len)
+    [(1, 213), (1, 1), (1, 312), (1, 7), (1, 7812)],  # decode batch
+    [(2345, 2345), (5, 5), (3, 16), (134, 5131)],  # prefill batch
+    [(992, 2456), (1, 1234), (98, 1145), (1, 4162), (2345, 2345)],  # mixed batch
+]
+
+
+def get_attn_isa(
+    block_size: int | None = None,
+    dtype: torch.dtype | None = None,
+):
+    """Return the ISA name to use for the CPU attention kernels.
+
+    When both `block_size` and `dtype` are given (and truthy) the decision is
+    delegated to the backend's `_get_attn_isa`. Otherwise the host is probed:
+    "neon" on ARM, "amx" when torch reports AMX tile support, else "vec".
+    """
+    if block_size and dtype:
+        return _get_attn_isa(dtype, block_size)
+    if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
+        return "neon"
+    if torch._C._cpu._is_amx_tile_supported():
+        return "amx"
+    return "vec"
+
+
+# Drawing random numbers is slow, so requests with the same
+# (elem_num, dtype) reuse one cached tensor.
+@functools.lru_cache(maxsize=128)
+def tensor_cache(
+    elem_num: int,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Return a 1-D `torch.randn` tensor of `elem_num` elements in `dtype`.
+
+    Results are memoized, so repeated calls with the same arguments return
+    the very same tensor object; callers should not modify it in place.
+    """
+    return torch.randn(elem_num, dtype=dtype)
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    """Build the per-head ALiBi slopes as a float32 tensor.
+
+    The first `p` slopes (p = largest power of two <= `total_num_heads`) are
+    successive powers of a base derived from `p`. When the head count is not
+    a power of two, the remaining heads take odd powers of the base derived
+    from `2 * p`.
+    """
+
+    def _slope_base(head_count: int) -> torch.Tensor:
+        # Base of the geometric sequence for a power-of-two head count.
+        return torch.tensor(
+            2 ** (-(2 ** -(math.log2(head_count) - 3))),
+            dtype=torch.float32,
+        )
+
+    pow2_heads = 2 ** math.floor(math.log2(total_num_heads))
+    exponents = torch.arange(1, 1 + pow2_heads, dtype=torch.int32)
+    slopes = torch.pow(_slope_base(pow2_heads), exponents)
+    if pow2_heads == total_num_heads:
+        return slopes.float()
+
+    # Heads beyond the power-of-two prefix use every other exponent.
+    extra_count = min(pow2_heads, total_num_heads - pow2_heads)
+    odd_exponents = torch.arange(
+        start=1, end=1 + 2 * extra_count, step=2, dtype=torch.int32
+    )
+    extra_slopes = torch.pow(_slope_base(2 * pow2_heads), odd_exponents)
+    return torch.cat([slopes, extra_slopes], dim=0).float()
+
+
+def ref_paged_attn(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    query_lens: list[int],
+    kv_lens: list[int],
+    block_tables: torch.Tensor,
+    scale: float,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
+    alibi_slopes: torch.Tensor | None = None,
+    s_aux: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """Reference causal attention over a paged KV cache, computed in float32.
+
+    Args:
+        query: Packed queries for all sequences, indexed as
+            (token, head, head_size). Not modified.
+        key_cache: Paged keys; its shape is unpacked as
+            (_, block_size, num_kv_heads, head_size).
+        value_cache: Paged values, indexed the same way as `key_cache`.
+        query_lens: Number of query tokens per sequence.
+        kv_lens: Number of KV tokens per sequence.
+        block_tables: Per-sequence rows of block indices into the caches.
+        scale: Factor applied to the queries before the dot product.
+        sliding_window: If set, additionally masks keys that fall outside
+            the window.
+        soft_cap: If set, logits become `soft_cap * tanh(logits / soft_cap)`.
+        alibi_slopes: Optional per-head slopes for a distance-based bias.
+        s_aux: Optional per-head extra logit that takes part in the softmax
+            normalisation but contributes no value.
+
+    Returns:
+        Attention outputs for all sequences concatenated along dim 0, cast
+        back to `query.dtype`.
+    """
+    num_seqs = len(query_lens)
+    block_tables = block_tables.cpu().numpy()
+    _, block_size, num_kv_heads, head_size = key_cache.shape
+    dtype = query.dtype
+
+    outputs: list[torch.Tensor] = []
+    start_idx = 0
+
+    # Reshape per-head vectors to (heads, 1, 1) so they broadcast against the
+    # (heads, q, k) logits.
+    if alibi_slopes is not None:
+        alibi_slopes = alibi_slopes[:, None, None]
+
+    if s_aux is not None:
+        s_aux = s_aux.float()
+        s_aux = s_aux[:, None, None]
+
+    for i in range(num_seqs):
+        query_len = query_lens[i]
+        kv_len = kv_lens[i]
+        # Scale out of place: for float32 input `.float()` returns the same
+        # storage as `query`, so an in-place `*=` would rescale the caller's
+        # tensor (which may be shared or cached).
+        q = query[start_idx : start_idx + query_len].float() * scale
+
+        # Gather this sequence's blocks and flatten them into a token axis.
+        num_kv_blocks = (kv_len + block_size - 1) // block_size
+        block_indices = block_tables[i, :num_kv_blocks]
+
+        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
+        k = k[:kv_len].float()
+        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
+        v = v[:kv_len].float()
+
+        # Grouped-query attention: repeat KV heads to match the query heads.
+        if q.shape[1] != k.shape[1]:
+            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
+            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
+        attn = torch.einsum("qhd,khd->hqk", q, k).float()
+        # True entries are masked out. Queries are aligned to the end of the
+        # KV sequence, so the causal diagonal is shifted by kv_len - query_len.
+        empty_mask = torch.ones(query_len, kv_len)
+        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
+
+        if sliding_window is not None:
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
+            mask |= sliding_window_mask
+
+        if soft_cap is not None:
+            attn = soft_cap * torch.tanh(attn / soft_cap)
+
+        if alibi_slopes is not None:
+            # Bias is -slope * (query position - key position).
+            q_start_pos = kv_len - query_len
+            q_pos = q_start_pos + torch.arange(0, query_len)[None, :, None]
+            kv_pos = torch.arange(0, kv_len)[None, None, :]
+            dist = q_pos - kv_pos
+            alibi_bias = -alibi_slopes * dist
+            attn += alibi_bias
+
+        attn.masked_fill_(mask, float("-inf"))
+
+        if s_aux is not None:
+            # Prepend the auxiliary logit as an extra column for the softmax.
+            s_aux_ext = s_aux.repeat(1, query_len, 1)
+            attn = torch.cat((s_aux_ext, attn), dim=-1)
+
+        attn = torch.softmax(attn, dim=-1)
+
+        if s_aux is not None:
+            # Drop the auxiliary column again; it has no associated value.
+            attn = attn[:, :, 1:]
+
+        out = torch.einsum("hqk,khd->qhd", attn, v).to(dtype=dtype)
+
+        outputs.append(out)
+        start_idx += query_len
+
+    return torch.cat(outputs, dim=0)
+
+
+@torch.inference_mode()
+def varlen_with_paged_kv(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    """Shared driver: compare the CPU paged-attention op with `ref_paged_attn`.
+
+    The kernel is run twice, once with scheduler metadata built with
+    `enable_kv_split=False` and once with `enable_kv_split=True`, and both
+    outputs are compared against the reference.
+
+    Args:
+        seq_lens: One (query_len, kv_len) pair per sequence in the batch.
+        num_heads: (num_query_heads, num_kv_heads).
+        head_size: Per-head feature size.
+        sliding_window: Window size, or None for no sliding window.
+        dtype: dtype of the query and KV cache tensors.
+        block_size: Tokens per KV-cache block.
+        soft_cap: Logit soft cap, or None (passed to the kernel as 0).
+        num_blocks: Number of blocks in the KV cache.
+        use_alibi: Whether to pass ALiBi slopes.
+        use_sink: Whether to pass random per-head `s_aux` values.
+        isa: ISA name forwarded to the CPU ops.
+
+    Raises:
+        AssertionError: If either kernel output differs from the reference
+            beyond the tolerances.
+    """
+    current_platform.seed_everything(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
+    scale = head_size**-0.5
+    token_num = sum(query_lens)
+
+    # for n heads the set of slopes is the geometric sequence that starts
+    # 2^(-8/n)
+    alibi_slopes = _get_alibi_slopes(num_query_heads) if use_alibi else None
+
+    s_aux = (
+        15 * torch.rand((num_query_heads,), dtype=torch.bfloat16) if use_sink else None
+    )
+
+    # Query and KV data come from the memoized random-tensor helper and are
+    # only reshaped here.
+    query = tensor_cache(
+        elem_num=token_num * num_query_heads * head_size,
+        dtype=dtype,
+    )
+    query = query.view(
+        token_num,
+        num_query_heads,
+        head_size,
+    )
+
+    key_value = tensor_cache(
+        elem_num=2 * num_blocks * num_kv_heads * block_size * head_size,
+        dtype=dtype,
+    )
+    key_value = key_value.view(
+        2,
+        num_blocks,
+        block_size,
+        num_kv_heads,
+        head_size,
+    )
+    key_cache, value_cache = key_value.unbind(0)
+
+    # KV cache for CPU attention
+    packed_key_cache = torch.empty(
+        num_blocks, num_kv_heads, block_size, head_size, dtype=dtype
+    )
+    packed_value_cache = torch.empty_like(packed_key_cache)
+
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
+    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    # use reshape_and_cache to pack key_cache and value_cache
+    # (identity slot mapping: token i of the flattened cache goes to slot i)
+    slot_mapping = torch.arange(0, num_blocks * block_size, dtype=torch.int64)
+    cpu_attn_reshape_and_cache(
+        key=key_cache.view(-1, num_kv_heads, head_size),
+        value=value_cache.view(-1, num_kv_heads, head_size),
+        key_cache=packed_key_cache,
+        value_cache=packed_value_cache,
+        slot_mapping=slot_mapping,
+        isa=isa,
+    )
+
+    # First run: scheduler metadata built without KV splitting.
+    metadata = cpu_attn_get_scheduler_metadata(
+        num_reqs=num_seqs,
+        num_heads=num_query_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim=head_size,
+        seq_lens=kv_lens_tensor,
+        dtype=dtype,
+        query_start_loc=cu_query_lens,
+        causal=True,
+        sliding_window_size=sliding_window if sliding_window is not None else -1,
+        isa=isa,
+        enable_kv_split=False,
+    )
+
+    out_without_split = torch.empty_like(query)
+    cpu_attention_with_kv_cache(
+        query=query,
+        key_cache=packed_key_cache,
+        value_cache=packed_value_cache,
+        output=out_without_split,
+        query_start_loc=cu_query_lens,
+        seq_lens=kv_lens_tensor,
+        scale=scale,
+        causal=True,
+        alibi_slopes=alibi_slopes,
+        sliding_window=window_size,
+        block_table=block_tables,
+        softcap=soft_cap if soft_cap is not None else 0,
+        scheduler_metadata=metadata,
+        s_aux=s_aux,
+    )
+
+    # Second run: identical inputs, scheduler metadata built with KV splitting.
+    metadata = cpu_attn_get_scheduler_metadata(
+        num_reqs=num_seqs,
+        num_heads=num_query_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim=head_size,
+        seq_lens=kv_lens_tensor,
+        dtype=dtype,
+        query_start_loc=cu_query_lens,
+        causal=True,
+        sliding_window_size=sliding_window if sliding_window is not None else -1,
+        isa=isa,
+        enable_kv_split=True,
+    )
+
+    out_with_split = torch.empty_like(query)
+    cpu_attention_with_kv_cache(
+        query=query,
+        key_cache=packed_key_cache,
+        value_cache=packed_value_cache,
+        output=out_with_split,
+        query_start_loc=cu_query_lens,
+        seq_lens=kv_lens_tensor,
+        scale=scale,
+        causal=True,
+        alibi_slopes=alibi_slopes,
+        sliding_window=window_size,
+        block_table=block_tables,
+        softcap=soft_cap if soft_cap is not None else 0,
+        scheduler_metadata=metadata,
+        s_aux=s_aux,
+    )
+
+    # The reference reads the unpacked caches through the same block tables.
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        sliding_window=sliding_window,
+        soft_cap=soft_cap,
+        alibi_slopes=alibi_slopes,
+        s_aux=s_aux,
+    )
+
+    atol, rtol = 1.5e-2, 1e-2
+    # Label each comparison so a failure says which run diverged; the message
+    # generated by assert_close already reports the greatest differences.
+    for label, actual in (
+        ("enable_kv_split=True", out_with_split),
+        ("enable_kv_split=False", out_without_split),
+    ):
+        torch.testing.assert_close(
+            actual,
+            ref_output,
+            atol=atol,
+            rtol=rtol,
+            msg=lambda generated, label=label: f"[{label}] {generated}",
+        )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", [96, 128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", QTYPES)
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", ["vec"])
+def test_varlen_with_paged_kv_normal_vec(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    """Plain attention (no soft cap, ALiBi or sink) with isa="vec".
+
+    Covers every dtype in QTYPES and block sizes 96 and 128; all arguments
+    are forwarded unchanged to `varlen_with_paged_kv`.
+    """
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", [96, 128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", ["amx"])
+@pytest.mark.skipif(
+    not torch._C._cpu._is_amx_tile_supported(), reason="no AMX support."
+)
+def test_varlen_with_paged_kv_normal_amx(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    """Plain attention (no soft cap, ALiBi or sink) with isa="amx".
+
+    Runs in bfloat16 only and is skipped when torch reports no AMX tile
+    support; all arguments are forwarded unchanged to `varlen_with_paged_kv`.
+    """
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", [48])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", ["vec16"])
+def test_varlen_with_paged_kv_normal_vec16(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    """Plain attention (no soft cap, ALiBi or sink) with isa="vec16".
+
+    Runs in bfloat16 with block size 48; all arguments are forwarded
+    unchanged to `varlen_with_paged_kv`.
+    """
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", [96, 128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", QTYPES)
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", ["neon"])
+@pytest.mark.skipif(
+    current_platform.get_cpu_architecture() != CpuArchEnum.ARM,
+    reason="Not an Arm CPU.",
+)
+def test_varlen_with_paged_kv_normal_neon(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    """Plain attention (no soft cap, ALiBi or sink) with isa="neon".
+
+    Covers every dtype in QTYPES and is skipped unless the CPU architecture
+    is ARM; all arguments are forwarded unchanged to `varlen_with_paged_kv`.
+    """
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", [96])
+@pytest.mark.parametrize("block_size", [128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [50])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", [get_attn_isa()])
+def test_varlen_with_paged_kv_softcap(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    """Attention with logit soft-capping (soft_cap=50) on the host's ISA.
+
+    The ISA is chosen once at collection time by `get_attn_isa()`; all
+    arguments are forwarded unchanged to `varlen_with_paged_kv`.
+    """
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", [96])
+@pytest.mark.parametrize("block_size", [128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [True])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", [get_attn_isa()])
+def test_varlen_with_paged_kv_alibi(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    """Attention with ALiBi slopes enabled on the host's ISA.
+
+    The ISA is chosen once at collection time by `get_attn_isa()`; all
+    arguments are forwarded unchanged to `varlen_with_paged_kv`.
+    """
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
+
+
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", [96])
+@pytest.mark.parametrize("block_size", [128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [True])
+@pytest.mark.parametrize("isa", [get_attn_isa()])
+def test_varlen_with_paged_kv_sink(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    """Attention with per-head sink values (`s_aux`) enabled on the host's ISA.
+
+    The ISA is chosen once at collection time by `get_attn_isa()`; all
+    arguments are forwarded unchanged to `varlen_with_paged_kv`.
+    """
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
--- a/tests/kernels/attention/test_cutlass_mla_decode.py
+++ b/tests/kernels/attention/test_cutlass_mla_decode.py
@@ -0,0 +1,214 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+import random
+
+import pytest
+import torch
+
+import vllm._custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+
+
+def cal_diff(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    name: str,
+    use_fp8: bool = False,
+    diff_threshold: float | None = None,
+) -> None:
+    """Assert that `x` and `y` are close under a cosine-style difference.
+
+    The metric is `1 - 2 * sum(x * y) / sum(x * x + y * y)`, computed in
+    float64; it is 0 when the tensors are identical.
+
+    Args:
+        x: First tensor.
+        y: Second tensor, broadcast-compatible with `x`.
+        name: Label for the compared quantity, reported on failure.
+        use_fp8: Selects the looser default threshold (1e-4 instead of 1e-5).
+        diff_threshold: Explicit threshold; overrides the default when given.
+
+    Raises:
+        AssertionError: If the difference is not below the threshold.
+    """
+    x, y = x.double(), y.double()
+    # The max() guards against division by zero when both tensors are all zero.
+    cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)
+    if diff_threshold is None:
+        diff_threshold = 1e-4 if use_fp8 else 1e-5
+    assert cos_diff < diff_threshold, (
+        f"{name}: cos_diff={cos_diff:.3e} is not below threshold {diff_threshold:.1e}"
+    )
+
+
+# Skip-reason text for the test below.
+# NOTE(review): this string is chosen with is_device_capability_family(100),
+# while the skipif condition uses has_device_capability(100) — confirm the
+# two checks are meant to differ.
+CUTLASS_MLA_UNSUPPORTED_REASON = (
+    "Cutlass MLA Requires compute capability of 100 or above."
+    if not current_platform.is_device_capability_family(100)
+    else "Cutlass MLA is supported"
+)
+
+
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(100),
+    reason=CUTLASS_MLA_UNSUPPORTED_REASON,
+)
+@pytest.mark.parametrize("b", [128])
+@pytest.mark.parametrize("s_q", [1])
+@pytest.mark.parametrize("mean_sk", [4096, 8192, 16384])
+@pytest.mark.parametrize("h_q", [16, 32, 64, 128])
+@pytest.mark.parametrize("h_kv", [1])
+@pytest.mark.parametrize("d", [576])
+@pytest.mark.parametrize("dv", [512])
+@pytest.mark.parametrize("block_size", [64])
+@pytest.mark.parametrize("causal", [True])
+@pytest.mark.parametrize("varlen", [False, True])
+@pytest.mark.parametrize(
+    "torch_dtype",
+    [
+        torch.bfloat16,
+        # fp8 can have occasional precision-related failures.
+        pytest.param(torch.float8_e4m3fn, marks=pytest.mark.flaky(reruns=2)),
+    ],
+)
+@torch.inference_mode()
+def test_cutlass_mla_decode(
+    b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, varlen, torch_dtype
+):
+    """Compare `ops.sm100_cutlass_mla_decode` with a PyTorch reference.
+
+    Random queries and a paged KV cache (the values are the first `dv`
+    channels of the keys) are fed to the CUTLASS kernel and to a float32
+    reference; both the attention output and the log-sum-exp are compared
+    with `cal_diff`. Afterwards the kernel is benchmarked and throughput
+    figures are printed.
+
+    NOTE: the test changes torch's process-wide default dtype and device and
+    does not restore them.
+    """
+    device = torch.device("cuda:0")
+    # fp8 inputs are generated in bfloat16 first and cast afterwards.
+    init_dtype = torch.bfloat16 if torch_dtype == torch.float8_e4m3fn else torch_dtype
+    torch.set_default_dtype(init_dtype)
+    torch.set_default_device(device)
+    torch.cuda.set_device(device)
+    torch.manual_seed(42)
+    random.seed(42)
+
+    print(
+        f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, "
+        f"{d=}, {dv=}, {causal=}, {varlen=}, {torch_dtype=}"
+    )
+
+    use_fp8 = torch_dtype == torch.float8_e4m3fn
+    scale = math.sqrt(d) ** (-1)
+    cache_seqlens = torch.full((b,), mean_sk, dtype=torch.int32)
+    if varlen:
+        # Draw per-sequence lengths around mean_sk, never shorter than s_q.
+        for i in range(b):
+            cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2), s_q)
+    total_seqlens = cache_seqlens.sum().item()
+    max_seqlen = cache_seqlens.max().item()
+    # Round the per-sequence capacity up to a multiple of 256 tokens.
+    max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256
+
+    q = torch.randn(b, s_q, h_q, d)
+    # Each sequence owns a contiguous run of blocks.
+    block_table = torch.arange(
+        b * max_seqlen_pad // block_size, dtype=torch.int32
+    ).view(b, max_seqlen_pad // block_size)
+    blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d)
+    blocked_v = blocked_k[..., :dv]
+
+    init_dtype = q.dtype
+    if use_fp8:
+        fp8_dtype = torch.float8_e4m3fn
+        descale_q = torch.ones((1), dtype=torch.float32)
+        descale_k = torch.ones((1), dtype=torch.float32)
+
+        q = q.to(fp8_dtype)
+        blocked_k = blocked_k.to(fp8_dtype)
+        blocked_v = blocked_v.to(fp8_dtype)
+    else:
+        descale_q = None
+        descale_k = None
+
+    def cutlass_mla():
+        """Run the kernel under test; returns (output, lse) for the real heads."""
+        MAX_HEADS = 128
+
+        # Split the query into its first dv channels and the remaining d - dv.
+        q_reshaped = q.squeeze(1)
+        q_nope = q_reshaped[:, :, :dv].clone()
+        q_pe = q_reshaped[:, :, dv:].clone()
+
+        # Pad the head dimension to MAX_HEADS; the padding rows are left
+        # uninitialised and sliced off again before returning.
+        if h_q < MAX_HEADS:
+            q_nope_padded = q_nope.new_empty((b, MAX_HEADS, dv))
+            q_nope_padded[:, :h_q] = q_nope
+            q_nope = q_nope_padded
+
+            q_pe_padded = q_pe.new_empty((b, MAX_HEADS, d - dv))
+            q_pe_padded[:, :h_q] = q_pe
+            q_pe = q_pe_padded
+
+        kv_cache_flat = blocked_k.squeeze(2)
+        device_properties = torch.cuda.get_device_properties(torch.device("cuda:0"))
+        sm_count = device_properties.multi_processor_count
+        workspace_size = ops.sm100_cutlass_mla_get_workspace_size(
+            max_seqlen * block_size, b, sm_count, num_kv_splits=1
+        )
+        workspace = torch.empty(workspace_size, device="cuda", dtype=torch.uint8)
+
+        out_ans = torch.empty(b, MAX_HEADS, dv, dtype=init_dtype)
+        output_lse = torch.empty(
+            (b, MAX_HEADS), dtype=torch.float32, device=q_nope.device
+        )
+        ops.sm100_cutlass_mla_decode(
+            out_ans,
+            output_lse,
+            q_nope,
+            q_pe,
+            kv_cache_flat,
+            cache_seqlens,
+            block_table,
+            workspace,
+            scale,
+            1,
+        )
+        return out_ans[:, :h_q].contiguous(), output_lse[:, :h_q].contiguous()
+
+    def scaled_dot_product_attention(query, key, value, is_causal=False):
+        """Float32 reference attention; returns (output, log-sum-exp)."""
+        query = query.float()
+        key = key.float()
+        value = value.float()
+        key = key.repeat_interleave(h_q // h_kv, dim=0)
+        value = value.repeat_interleave(h_q // h_kv, dim=0)
+        attn_weight = query @ key.transpose(-2, -1) / math.sqrt(query.size(-1))
+        if is_causal:
+            q_len = query.shape[-2]
+            k_len = key.shape[-2]
+            # Additive bias: 0 where attention is allowed, -inf elsewhere.
+            attn_bias = torch.zeros(q_len, k_len, dtype=query.dtype)
+            temp_mask = torch.ones(q_len, k_len, dtype=torch.bool).tril(
+                diagonal=k_len - q_len
+            )
+            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+            attn_weight += attn_bias
+        lse = attn_weight.logsumexp(dim=-1)
+        attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32)
+        return attn_weight @ value, lse
+
+    def ref_mla():
+        """Reference output and lse for every sequence in the batch."""
+        q_ = (q.to(torch.float) * descale_q).to(init_dtype) if use_fp8 else q
+        blocked_k_ = (
+            (blocked_k.to(torch.float) * descale_k).to(init_dtype)
+            if use_fp8
+            else blocked_k
+        )
+        blocked_v_ = (
+            (blocked_v.to(torch.float) * descale_k).to(init_dtype)
+            if use_fp8
+            else blocked_v
+        )
+        out = torch.empty(b, s_q, h_q, dv, dtype=torch.float32)
+        lse = torch.empty(b, h_q, s_q, dtype=torch.float32)
+        for i in range(b):
+            # Sequence i occupies tokens [begin, end) of the flattened cache.
+            begin = i * max_seqlen_pad
+            end = begin + cache_seqlens[i]
+            out_i, lse_i = scaled_dot_product_attention(
+                q_[i].transpose(0, 1),
+                blocked_k_.view(-1, h_kv, d)[begin:end].transpose(0, 1),
+                blocked_v_.view(-1, h_kv, dv)[begin:end].transpose(0, 1),
+                is_causal=causal,
+            )
+            out[i] = out_i.transpose(0, 1)
+            lse[i] = lse_i
+        return out, lse
+
+    out_cutlass, lse_cutlass = cutlass_mla()
+    out_torch, lse_torch = ref_mla()
+    # Extract the single token (s_q=1) slice to match cutlass output shape
+    out_torch_slice = out_torch[:, 0, :, :]  # [b, h_q, dv]
+    lse_torch_slice = lse_torch[:, 0, :]  # [b, h_q]
+    cal_diff(out_cutlass, out_torch_slice, "out", use_fp8)
+    # lse has larger numerical error, so use a larger threshold
+    cal_diff(lse_cutlass, lse_torch_slice, "lse", use_fp8, diff_threshold=1e-3)
+
+    # Informational benchmark only; nothing below is asserted.
+    t = triton.testing.do_bench(cutlass_mla)
+    FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2
+    num_bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d) * (
+        torch.finfo(torch_dtype).bits // 8
+    ) + (b * s_q * h_q * dv) * (torch.finfo(init_dtype).bits // 8)
+    print(
+        f"{t:.3f} ms, {FLOPS / 10**9 / t:.0f} TFLOPS,",
+        f"{num_bytes / 10**6 / t:.0f} GB/s",
+    )
--- a/tests/kernels/attention/test_deepgemm_attention.py
+++ b/tests/kernels/attention/test_deepgemm_attention.py
@@ -0,0 +1,294 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+from vllm.utils.deep_gemm import (
+    _ceil_to_ue8m0,
+    calc_diff,
+    fp8_mqa_logits,
+    fp8_paged_mqa_logits,
+    get_num_sms,
+    get_paged_mqa_logits_metadata,
+)
+from vllm.utils.import_utils import has_deep_gemm
+from vllm.utils.math_utils import cdiv
+
+
+def kv_cache_cast_to_fp8(x: torch.Tensor) -> torch.Tensor:
+    """Quantize a single-head KV cache to FP8 and pack it with its scales.
+
+    Each token row is scaled so that its largest magnitude (floored at 1e-4)
+    maps to 448.0 before the cast to ``torch.float8_e4m3fn``. Per block, the
+    output bytes hold all ``block_size * head_dim`` quantized values first,
+    followed by the ``block_size`` float32 scale factors (4 bytes each).
+
+    Args:
+        x: Tensor of shape ``(num_blocks, block_size, 1, head_dim)``.
+
+    Returns:
+        A ``torch.uint8`` tensor of shape
+        ``(num_blocks, block_size, 1, head_dim + 4)`` viewing that packed
+        buffer.
+    """
+    num_blocks, block_size, num_heads, head_dim = x.shape
+    assert num_heads == 1
+    num_value_bytes = block_size * head_dim
+
+    # One scale per token: amax over head_dim, divided by 448.0.
+    scales = x.abs().float().amax(dim=3, keepdim=True).clamp(1e-4) / 448.0
+    quantized = (x * (1.0 / scales)).to(torch.float8_e4m3fn)
+
+    packed = torch.empty(
+        (num_blocks, block_size * (head_dim + 4)),
+        dtype=torch.uint8,
+        device=x.device,
+    )
+    value_bytes = quantized.view(num_blocks, num_value_bytes).view(dtype=torch.uint8)
+    scale_bytes = scales.view(num_blocks, block_size).view(dtype=torch.uint8)
+    packed[:, :num_value_bytes] = value_bytes
+    packed[:, num_value_bytes:] = scale_bytes
+    return packed.view(num_blocks, block_size, num_heads, head_dim + 4)
+
+
+def per_custom_dims_cast_to_fp8(
+    x: torch.Tensor, dims: tuple, use_ue8m0: bool
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Quantize ``x`` to FP8 with one scale per index along ``dims``.
+
+    The absolute maximum is reduced over every dimension *not* listed in
+    ``dims`` (floored at 1e-4) and divided by 448.0 to form the scale.
+
+    Args:
+        x: Tensor to quantize.
+        dims: Dimensions that keep their own scale factor.
+        use_ue8m0: When true, the scale is passed through ``_ceil_to_ue8m0``.
+
+    Returns:
+        ``(x_fp8, scale)`` where ``x_fp8`` is ``torch.float8_e4m3fn`` and
+        ``scale`` has all of its size-1 dimensions squeezed away.
+    """
+    kept_dims = set(dims)
+    reduced_dims = tuple(d for d in range(x.dim()) if d not in kept_dims)
+    scale = x.abs().float().amax(dim=reduced_dims, keepdim=True).clamp(1e-4) / 448.0
+    if use_ue8m0:
+        scale = _ceil_to_ue8m0(scale)
+    return (x * (1.0 / scale)).to(torch.float8_e4m3fn), scale.squeeze()
+
+
+def _generate_cp_test_data(
+    seq_len: int, seq_len_kv: int, device: str | torch.device = "cuda"
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Build per-query KV start/end indices for one context-parallel rank.
+
+    The queries are split into two chunks of ``seq_len // 2``. With
+    ``cp_size = seq_len_kv // seq_len`` and ``cp_id = cp_size // 3``, query
+    ``i`` of the first chunk ends at ``cp_id * chunk_size + i`` and query
+    ``i`` of the second chunk ends at
+    ``(2 * cp_size - 1 - cp_id) * chunk_size + i``. All starts are zero.
+
+    Args:
+        seq_len: Number of queries; must be even and divide ``seq_len_kv``.
+        seq_len_kv: Number of KV positions.
+        device: Device for the returned tensors (defaults to ``"cuda"``).
+
+    Returns:
+        ``(ks, ke)``: two int32 tensors of length ``seq_len``.
+    """
+    assert seq_len_kv % seq_len == 0 and seq_len % 2 == 0
+    chunk_size = seq_len // 2
+    cp_size = seq_len_kv // seq_len
+    cp_id = cp_size // 3
+    # Vectorized fill: one arange instead of 2 * chunk_size scalar writes
+    # into a device tensor.
+    offsets = torch.arange(chunk_size, dtype=torch.int, device=device)
+    ks = torch.zeros(seq_len, dtype=torch.int, device=device)
+    ke = torch.cat(
+        (
+            cp_id * chunk_size + offsets,
+            (cp_size * 2 - 1 - cp_id) * chunk_size + offsets,
+        )
+    )
+    return ks, ke
+
+
+def _ref_fp8_mqa_logits(
+    q: torch.Tensor,
+    kv: torch.Tensor,
+    weights: torch.Tensor,
+    cu_seqlen_ks: torch.Tensor,
+    cu_seqlen_ke: torch.Tensor,
+) -> torch.Tensor:
+    """Float reference for the weighted multi-query logits.
+
+    For query ``m`` and KV position ``n`` the result is
+    ``sum_h relu(q[m, h] . kv[n]) * weights[m, h]``; positions outside
+    ``[cu_seqlen_ks[m], cu_seqlen_ke[m])`` are set to ``-inf``.
+
+    Args:
+        q: Queries, indexed as ``(m, h, d)`` by the einsum below.
+        kv: Keys, indexed as ``(n, d)``.
+        weights: Per-query, per-head weights, indexed as ``(m, h)``.
+        cu_seqlen_ks: Inclusive start KV index per query.
+        cu_seqlen_ke: Exclusive end KV index per query.
+
+    Returns:
+        Logits of shape ``(m, n)``.
+    """
+    seq_len_kv = kv.shape[0]
+    q = q.float()
+    k = kv.float()
+
+    # Build the KV position index once, on the inputs' device rather than a
+    # hard-coded "cuda", so the reference also works for non-CUDA tensors.
+    positions = torch.arange(0, seq_len_kv, device=kv.device)[None, :]
+    mask = (positions >= cu_seqlen_ks[:, None]) & (positions < cu_seqlen_ke[:, None])
+
+    score = torch.einsum("mhd,nd->hmn", q, k)
+    # weights (m, h) -> (h, m, 1) so it broadcasts over the KV axis.
+    logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0)
+    return logits.masked_fill(~mask, float("-inf"))
+
+
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA only")
+@pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available")
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90), reason="SM90 and SM100 only"
+)
+def test_deepgemm_fp8_mqa_logits():
+    """Compare ``fp8_mqa_logits`` with the float reference implementation.
+
+    Runs once with the ranges from ``_generate_cp_test_data`` and once with
+    ranges where query ``i`` covers KV positions
+    ``[0, i + seq_len_kv - seq_len)``. The ``-inf`` pattern must match
+    exactly and the remaining logits must satisfy ``calc_diff < 1e-3``.
+    """
+    torch.manual_seed(0)
+    random.seed(0)
+    num_heads, head_dim = 32, 128
+    for seq_len in (512,):
+        for seq_len_kv in (1024,):
+            for disable_cp in (False, True):
+                q = torch.randn(
+                    seq_len,
+                    num_heads,
+                    head_dim,
+                    device="cuda",
+                    dtype=torch.bfloat16,
+                )
+                kv = torch.randn(
+                    seq_len_kv, head_dim, device="cuda", dtype=torch.bfloat16
+                )
+                weights = torch.randn(
+                    seq_len, num_heads, device="cuda", dtype=torch.float32
+                )
+
+                # ks/ke are the per-query [start, end) KV ranges.
+                if disable_cp:
+                    ks = torch.zeros(seq_len, dtype=torch.int, device="cuda")
+                    ke = torch.arange(seq_len, dtype=torch.int, device="cuda") + (
+                        seq_len_kv - seq_len
+                    )
+                else:
+                    ks, ke = _generate_cp_test_data(seq_len, seq_len_kv)
+
+                # q is cast directly; kv is quantized with one scale per
+                # KV row and passed as a (values, scales) tuple.
+                q_fp8 = q.to(torch.float8_e4m3fn)
+                kv_fp8 = per_custom_dims_cast_to_fp8(kv, (0,), False)
+                logits = fp8_mqa_logits(q_fp8, kv_fp8, weights, ks, ke)
+
+                # The reference uses the unquantized bf16 inputs.
+                ref_logits = _ref_fp8_mqa_logits(
+                    q=q,
+                    kv=kv,
+                    weights=weights,
+                    cu_seqlen_ks=ks,
+                    cu_seqlen_ke=ke,
+                )
+
+                # Masked positions must agree exactly.
+                ref_neginf_mask = ref_logits == float("-inf")
+                neginf_mask = logits == float("-inf")
+                assert torch.equal(neginf_mask, ref_neginf_mask)
+
+                # Zero out the -inf entries so calc_diff only sees finite values.
+                ref_logits = ref_logits.masked_fill(ref_neginf_mask, 0)
+                logits = logits.masked_fill(neginf_mask, 0)
+                diff = calc_diff(logits, ref_logits)
+                assert diff < 1e-3, f"{diff=}"
+
+
+def _ref_fp8_paged_mqa_logits(
+    q: torch.Tensor,
+    kv_cache: torch.Tensor,
+    weights: torch.Tensor,
+    context_lens: torch.Tensor,
+    block_tables: torch.Tensor,
+    max_model_len: int,
+) -> torch.Tensor:
+    """Reference weighted multi-query logits over a paged KV cache.
+
+    For each sequence, the last ``next_n`` positions of its context are the
+    queries. Each query is scored against every cached block of that
+    sequence; scores pass through ReLU, are weighted per head and summed over
+    heads. KV positions after the query's own position stay ``-inf``.
+
+    Args:
+        q: Queries; dim 0 is the batch, dim 1 is ``next_n``.
+        kv_cache: Paged cache; dim 1 is the block size.
+        weights: Per-head weights with ``batch_size * next_n`` rows.
+        context_lens: Context length per sequence.
+        block_tables: Cache block index per (sequence, logical block).
+        max_model_len: Number of columns in the returned logits.
+
+    Returns:
+        Float32 logits of shape ``(batch_size * next_n, max_model_len)``.
+    """
+    batch_size, next_n, _, _ = q.size()
+    _, block_size, _, _ = kv_cache.size()
+    # Use q's device for every helper tensor instead of mixing it with a
+    # hard-coded "cuda".
+    device = q.device
+    logits = torch.full(
+        [batch_size * next_n, max_model_len],
+        float("-inf"),
+        device=device,
+        dtype=torch.float32,
+    )
+    context_lens_list = context_lens.tolist()
+    for i in range(batch_size):
+        context_len = context_lens_list[i]
+        # Absolute positions of this sequence's next_n query tokens.
+        q_offsets = torch.arange(context_len - next_n, context_len, device=device)
+        # (next_n, heads) -> (heads, next_n) to line up with the score layout.
+        weight_slice = (
+            weights[i * next_n : (i + 1) * next_n, :].transpose(0, 1).contiguous()
+        )
+        for block_rk in range(cdiv(context_len, block_size)):
+            block_idx = block_tables[i][block_rk]
+            qx, kx = q[i], kv_cache[block_idx]
+            k_offsets = torch.arange(
+                block_rk * block_size,
+                (block_rk + 1) * block_size,
+                device=device,
+            )
+            # Valid keys lie inside the context and not after the query.
+            mask = (k_offsets[None, :] < context_len) & (
+                k_offsets[None, :] <= q_offsets[:, None]
+            )
+            s = torch.where(
+                mask[None, :, :],
+                (qx.transpose(0, 1) @ kx.transpose(0, 1).transpose(1, 2)).to(
+                    logits.dtype
+                ),
+                float("-inf"),
+            )
+            # ReLU turns the masked -inf scores into 0 before the head sum.
+            s = torch.relu(s) * weight_slice[..., None]
+            s = s.sum(dim=0)
+            # Restore -inf for keys that come after the query position.
+            logits[
+                i * next_n : (i + 1) * next_n,
+                block_rk * block_size : (block_rk + 1) * block_size,
+            ] = torch.where(k_offsets[None, :] <= q_offsets[:, None], s, float("-inf"))
+    return logits
+
+
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA only")
+@pytest.mark.skipif(not has_deep_gemm(), reason="DeepGEMM not available")
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(90), reason="SM90 and SM100 only"
+)
+def test_deepgemm_fp8_paged_mqa_logits():
+    """Compare ``fp8_paged_mqa_logits`` with the float reference.
+
+    Builds a random paged KV cache with a shuffled block assignment, runs the
+    kernel on the FP8-packed cache and the reference on the bf16 cache, and
+    requires ``calc_diff < 1e-3`` over the positions each query may see.
+    """
+    torch.manual_seed(0)
+    random.seed(0)
+
+    max_model_len = 4096
+    for batch_size, next_n in [(4, 1), (2, 2)]:
+        for heads, index_dim in [(32, 128)]:
+            for avg_kv in (2048,):
+                num_blocks, blocksize = max_model_len * 2, 64
+
+                q = torch.randn(
+                    (batch_size, next_n, heads, index_dim),
+                    device="cuda",
+                    dtype=torch.bfloat16,
+                )
+                kv_cache = torch.randn(
+                    (num_blocks, blocksize, 1, index_dim),
+                    device="cuda",
+                    dtype=torch.bfloat16,
+                )
+                weights = torch.randn(
+                    (batch_size * next_n, heads),
+                    device="cuda",
+                    dtype=torch.float32,
+                )
+
+                # Context lengths drawn from [0.8 * avg_kv, 1.2 * avg_kv).
+                context_lens = (
+                    torch.randint(int(0.8 * avg_kv), int(1.2 * avg_kv), (batch_size,))
+                    .cuda()
+                    .to(torch.int32)
+                )
+                # NOTE(review): this is the longest context rounded up to a
+                # multiple of blocksize (a token count, not a block count), so
+                # the table has more columns than are ever filled - confirm
+                # this is intentional.
+                max_block_len = (
+                    (context_lens.max().item() + blocksize - 1) // blocksize * blocksize
+                )
+                block_tables = torch.zeros(
+                    (batch_size, max_block_len),
+                    device="cuda",
+                    dtype=torch.int32,
+                )
+
+                # Hand out distinct physical blocks in shuffled order.
+                counter = 0
+                block_idx_pool = list(range(num_blocks))
+                random.shuffle(block_idx_pool)
+                for i in range(batch_size):
+                    ctx_len = int(context_lens[i].item())
+                    for j in range((ctx_len + blocksize - 1) // blocksize):
+                        block_tables[i][j] = block_idx_pool[counter]
+                        counter += 1
+
+                q_fp8 = q.to(torch.float8_e4m3fn)
+                kv_cache_fp8 = kv_cache_cast_to_fp8(kv_cache)
+
+                schedule_metadata = get_paged_mqa_logits_metadata(
+                    context_lens, blocksize, get_num_sms()
+                )
+                logits = fp8_paged_mqa_logits(
+                    q_fp8,
+                    kv_cache_fp8,
+                    weights,
+                    context_lens,
+                    block_tables,
+                    schedule_metadata,
+                    max_model_len,
+                )
+
+                # The reference runs on the unquantized bf16 tensors.
+                ref_logits = _ref_fp8_paged_mqa_logits(
+                    q,
+                    kv_cache,
+                    weights,
+                    context_lens,
+                    block_tables,
+                    max_model_len,
+                )
+
+                # Row r belongs to sequence r // next_n and is its
+                # (r % next_n)-th query; it may see positions up to
+                # context_len - next_n + (r % next_n), inclusive.
+                positions = (
+                    torch.arange(max_model_len, device="cuda")
+                    .unsqueeze(0)
+                    .expand(batch_size * next_n, -1)
+                )
+                row_indices = torch.arange(batch_size * next_n, device="cuda") // next_n
+                next_n_offset = (
+                    torch.arange(batch_size * next_n, device="cuda") % next_n
+                )
+                mask = positions <= (
+                    context_lens[row_indices] - next_n + next_n_offset
+                ).unsqueeze(1)
+
+                # Compare only the visible positions; everything else is zeroed.
+                logits = logits.masked_fill(~mask, 0)
+                ref_logits = ref_logits.masked_fill(~mask, 0)
+                diff = calc_diff(logits, ref_logits)
+                assert diff < 1e-3, f"{diff=}"
--- a/tests/kernels/attention/test_flash_attn.py
+++ b/tests/kernels/attention/test_flash_attn.py
@@ -0,0 +1,216 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+try:
+    from vllm.vllm_flash_attn import (
+        fa_version_unsupported_reason,
+        flash_attn_varlen_func,
+        is_fa_version_supported,
+    )
+except ImportError:
+    if current_platform.is_rocm():
+        pytest.skip(
+            "vllm_flash_attn is not supported for vLLM on ROCm.",
+            allow_module_level=True,
+        )
+
+
+# (num_query_heads, num_kv_heads) pairs: one with equal head counts and one
+# where several query heads share each KV head.
+NUM_HEADS = [(4, 4), (8, 2)]
+HEAD_SIZES = [40, 72, 80, 128, 256]
+BLOCK_SIZES = [16]
+DTYPES = [torch.bfloat16]
+# None runs unquantized inputs; otherwise q/k/v are cast to this dtype.
+QDTYPES = [None, torch.float8_e4m3fn]
+# one value large enough to test overflow in index calculation.
+# one value small enough to test the schema op check
+NUM_BLOCKS = [32768, 2048]
+# None disables logit soft-capping / sliding-window masking respectively.
+SOFT_CAPS = [None]
+SLIDING_WINDOWS = [None, 256]
+
+
+def ref_paged_attn(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    query_lens: list[int],
+    kv_lens: list[int],
+    block_tables: torch.Tensor,
+    scale: float,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
+) -> torch.Tensor:
+    """Reference causal attention over a paged KV cache.
+
+    Sequences are processed one at a time: the sequence's blocks are gathered
+    from the caches through ``block_tables``, trimmed to its KV length, and
+    attended to with a causal mask aligned to the end of the KV sequence.
+
+    Args:
+        query: Packed queries, ``(sum(query_lens), num_query_heads, head_size)``.
+        key_cache: ``(num_blocks, block_size, num_kv_heads, head_size)``.
+        value_cache: Same layout as ``key_cache``.
+        query_lens: Number of query tokens per sequence.
+        kv_lens: Number of KV tokens per sequence (ints or 0-d tensors).
+        block_tables: Cache block index per (sequence, logical block).
+        scale: Factor applied to the queries before the dot product.
+        sliding_window: If set, also masks keys outside the window.
+        soft_cap: If set, logits become ``soft_cap * tanh(logits / soft_cap)``.
+
+    Returns:
+        Attention output with the same packed layout as ``query``. ``query``
+        itself is left unmodified.
+    """
+    num_seqs = len(query_lens)
+    block_tables = block_tables.cpu().numpy()
+    _, block_size, num_kv_heads, head_size = key_cache.shape
+
+    outputs: list[torch.Tensor] = []
+    start_idx = 0
+    for i in range(num_seqs):
+        query_len = query_lens[i]
+        # Callers may pass kv_lens as a tensor; normalize to a Python int.
+        kv_len = int(kv_lens[i])
+        # Scale out of place: the slice is a view, so an in-place `*=` would
+        # silently rescale the caller's query tensor.
+        q = query[start_idx : start_idx + query_len] * scale
+
+        num_kv_blocks = (kv_len + block_size - 1) // block_size
+        block_indices = block_tables[i, :num_kv_blocks]
+
+        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
+        k = k[:kv_len]
+        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
+        v = v[:kv_len]
+
+        # Grouped-query attention: repeat each KV head for its query heads.
+        if q.shape[1] != k.shape[1]:
+            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
+            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
+        attn = torch.einsum("qhd,khd->hqk", q, k).float()
+        # Build the mask on the query's device instead of relying on the
+        # process-wide default device. True marks positions to hide.
+        empty_mask = torch.ones(query_len, kv_len, device=query.device)
+        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
+        if sliding_window is not None:
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
+            mask |= sliding_window_mask
+        if soft_cap is not None:
+            attn = soft_cap * torch.tanh(attn / soft_cap)
+        attn.masked_fill_(mask, float("-inf"))
+        attn = torch.softmax(attn, dim=-1).to(v.dtype)
+        out = torch.einsum("hqk,khd->qhd", attn, v)
+
+        outputs.append(out)
+        start_idx += query_len
+
+    return torch.cat(outputs, dim=0)
+
+
+@pytest.mark.parametrize("use_out", [True, False])
+@pytest.mark.parametrize(
+    "seq_lens", [[(1, 1328), (5, 18), (129, 463)], [(1, 523), (1, 37), (1, 2011)]]
+)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("fa_version", [2, 3])
+@pytest.mark.parametrize("q_dtype", QDTYPES)
+@torch.inference_mode()
+def test_varlen_with_paged_kv(
+    use_out: bool,
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    fa_version: int,
+    q_dtype: torch.dtype | None,
+) -> None:
+    """Check ``flash_attn_varlen_func`` on a paged KV cache against the reference.
+
+    Args:
+        use_out: Whether the kernel writes into a preallocated ``out`` tensor.
+        seq_lens: ``(query_len, kv_len)`` per sequence.
+        num_heads: ``(num_query_heads, num_kv_heads)``.
+        head_size: Size of each attention head.
+        sliding_window: Sliding-window size, or ``None`` to disable it.
+        dtype: Base dtype of the query and caches.
+        block_size: Tokens per cache block.
+        soft_cap: Logit soft cap, or ``None`` to disable it.
+        num_blocks: Number of blocks in the cache.
+        fa_version: Flash-attention version to run.
+        q_dtype: If set, q/k/v are cast to this dtype before the kernel call.
+    """
+    torch.set_default_device("cuda")
+    if not is_fa_version_supported(fa_version):
+        pytest.skip(
+            f"Flash attention version {fa_version} not supported due "
+            f'to: "{fa_version_unsupported_reason(fa_version)}"'
+        )
+    if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2):
+        pytest.skip(
+            "Flash attention with quantized inputs is only "
+            "supported on version 3 with bfloat16 base type"
+        )
+    current_platform.seed_everything(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_query_len = max(query_lens)
+    max_kv_len = max(kv_lens)
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
+    scale = head_size**-0.5
+
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
+    key_cache = torch.randn(
+        num_blocks, block_size, num_kv_heads, head_size, dtype=dtype
+    )
+    value_cache = torch.randn_like(key_cache)
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
+    kv_lens = torch.tensor(kv_lens, dtype=torch.int32)
+
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    out = torch.empty_like(query) if use_out else None
+
+    maybe_quantized_query = query
+    maybe_quantized_key_cache = key_cache
+    maybe_quantized_value_cache = value_cache
+    q_descale = None
+    k_descale = None
+    v_descale = None
+    if q_dtype is not None:
+        # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
+        maybe_quantized_query = query.to(q_dtype)
+        maybe_quantized_key_cache = key_cache.to(q_dtype)
+        maybe_quantized_value_cache = value_cache.to(q_dtype)
+
+        scale_shape = (num_seqs, num_kv_heads)
+        q_descale = torch.ones(scale_shape, dtype=torch.float32)
+        k_descale = torch.ones(scale_shape, dtype=torch.float32)
+        v_descale = torch.ones(scale_shape, dtype=torch.float32)
+
+    output = flash_attn_varlen_func(
+        q=maybe_quantized_query,
+        k=maybe_quantized_key_cache,
+        v=maybe_quantized_value_cache,
+        out=out,
+        cu_seqlens_q=cu_query_lens,
+        seqused_k=kv_lens,
+        max_seqlen_q=max_query_len,
+        max_seqlen_k=max_kv_len,
+        softmax_scale=scale,
+        causal=True,
+        window_size=window_size,
+        block_table=block_tables,
+        softcap=soft_cap if soft_cap is not None else 0,
+        fa_version=fa_version,
+        q_descale=q_descale,
+        k_descale=k_descale,
+        v_descale=v_descale,
+    )
+    output = output if not use_out else out
+
+    # The reference always runs on the unquantized tensors.
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        sliding_window=sliding_window,
+        soft_cap=soft_cap,
+    )
+    atol, rtol = 1.5e-2, 1e-2
+    if q_dtype is not None:
+        # Quantized inputs get a looser tolerance.
+        atol, rtol = 1.5e-1, 1.5e-1
+    # assert_close raises with its own mismatch report, so call it directly
+    # instead of wrapping it in a tuple with a message string that is
+    # computed and then thrown away.
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
--- a/tests/kernels/attention/test_flashinfer.py
+++ b/tests/kernels/attention/test_flashinfer.py
@@ -0,0 +1,497 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+
+from vllm.platforms import current_platform
+
+try:
+    import flashinfer
+except ImportError:
+    if current_platform.is_rocm():
+        pytest.skip(
+            "flashinfer is not supported for vLLM on ROCm.", allow_module_level=True
+        )
+
+import torch
+
+# (num_query_heads, num_kv_heads) pairs; query heads must be a multiple of
+# KV heads.
+NUM_HEADS = [(32, 8), (6, 1)]
+HEAD_SIZES = [128, 256]
+BLOCK_SIZES = [16, 32]
+DTYPES = [torch.bfloat16]
+NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
+# None disables logit soft-capping / sliding-window masking respectively.
+SOFT_CAPS = [None, 30.0]
+SLIDING_WINDOWS = [None, 64]
+
+
+def ref_paged_attn(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    query_lens: list[int],
+    kv_lens: list[int],
+    block_tables: torch.Tensor,
+    scale: float,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
+) -> torch.Tensor:
+    """Reference causal attention over a paged KV cache.
+
+    Sequences are processed one at a time: the sequence's blocks are gathered
+    from the caches through ``block_tables``, trimmed to its KV length, and
+    attended to with a causal mask aligned to the end of the KV sequence.
+
+    Args:
+        query: Packed queries, ``(sum(query_lens), num_query_heads, head_size)``.
+        key_cache: ``(num_blocks, block_size, num_kv_heads, head_size)``.
+        value_cache: Same layout as ``key_cache``.
+        query_lens: Number of query tokens per sequence.
+        kv_lens: Number of KV tokens per sequence (ints or 0-d tensors).
+        block_tables: Cache block index per (sequence, logical block).
+        scale: Factor applied to the queries before the dot product.
+        sliding_window: If set, also masks keys outside the window.
+        soft_cap: If set, logits become ``soft_cap * tanh(logits / soft_cap)``.
+
+    Returns:
+        Attention output with the same packed layout as ``query``. ``query``
+        itself is left unmodified.
+    """
+    num_seqs = len(query_lens)
+    block_tables = block_tables.cpu().numpy()
+    _, block_size, num_kv_heads, head_size = key_cache.shape
+
+    outputs: list[torch.Tensor] = []
+    start_idx = 0
+    for i in range(num_seqs):
+        query_len = query_lens[i]
+        # Callers may pass kv_lens as a tensor; normalize to a Python int.
+        kv_len = int(kv_lens[i])
+        # Scale out of place: the slice is a view, so an in-place `*=` would
+        # silently rescale the caller's query tensor.
+        q = query[start_idx : start_idx + query_len] * scale
+
+        num_kv_blocks = (kv_len + block_size - 1) // block_size
+        block_indices = block_tables[i, :num_kv_blocks]
+
+        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
+        k = k[:kv_len]
+        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
+        v = v[:kv_len]
+
+        # Grouped-query attention: repeat each KV head for its query heads.
+        if q.shape[1] != k.shape[1]:
+            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
+            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
+        attn = torch.einsum("qhd,khd->hqk", q, k).float()
+        # Build the mask on the query's device instead of relying on the
+        # process-wide default device. True marks positions to hide.
+        empty_mask = torch.ones(query_len, kv_len, device=query.device)
+        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
+        if sliding_window is not None:
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
+            mask |= sliding_window_mask
+        if soft_cap is not None:
+            attn = soft_cap * torch.tanh(attn / soft_cap)
+        attn.masked_fill_(mask, float("-inf"))
+        attn = torch.softmax(attn, dim=-1).to(v.dtype)
+        out = torch.einsum("hqk,khd->qhd", attn, v)
+
+        outputs.append(out)
+        start_idx += query_len
+
+    return torch.cat(outputs, dim=0)
+
+
+@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@torch.inference_mode
+def test_flashinfer_decode_with_paged_kv(
+    kv_lens: list[int],
+    num_heads: tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    sliding_window: int | None,
+) -> None:
+    """Check FlashInfer batched decode on a paged KV cache against the reference.
+
+    Every sequence contributes one query token. ``kv_lens`` gives the KV
+    length per sequence and ``num_heads`` is
+    ``(num_query_heads, num_kv_heads)``.
+    """
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(0)
+    num_seqs = len(kv_lens)
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    scale = head_size**-0.5
+
+    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
+
+    # Dim 1 selects key (0) or value (1).
+    key_value_cache = torch.randn(
+        NUM_BLOCKS, 2, block_size, num_kv_heads, head_size, dtype=dtype
+    )
+    key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
+    value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
+
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    # Flatten the block tables into the indptr / indices / last-page-length
+    # arrays that wrapper.plan() takes.
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(num_seqs):
+        seq_len = kv_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        # Collect plain ints rather than 0-d device tensors.
+        kv_indices.extend(block_tables[i, :num_blocks].tolist())
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD", use_tensor_cores=True
+    )
+    wrapper.plan(
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        block_size,
+        "NONE",
+        window_left=sliding_window - 1 if sliding_window is not None else -1,
+        q_data_type=dtype,
+        kv_data_type=dtype,
+        logits_soft_cap=soft_cap,
+    )
+
+    output = wrapper.run(query, key_value_cache)
+
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=[1] * num_seqs,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        soft_cap=soft_cap,
+        sliding_window=sliding_window,
+    )
+    # assert_close raises with its own mismatch report, so call it directly
+    # instead of wrapping it in a tuple with a discarded message string.
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@torch.inference_mode
+def test_flashinfer_prefill_with_paged_kv(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    sliding_window: int | None,
+) -> None:
+    """Check FlashInfer batched prefill on a paged KV cache against the reference.
+
+    ``seq_lens`` holds ``(query_len, kv_len)`` per sequence and ``num_heads``
+    is ``(num_query_heads, num_kv_heads)``.
+    """
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    scale = head_size**-0.5
+
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
+    # Dim 1 selects key (0) or value (1).
+    key_value_cache = torch.randn(
+        NUM_BLOCKS, 2, block_size, num_kv_heads, head_size, dtype=dtype
+    )
+    key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
+    value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
+
+    # Normalize the scale of the key and value caches to mitigate
+    # numerical instability.
+    key_cache /= head_size**0.5
+    value_cache /= head_size**0.5
+
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    # Flatten the block tables into the indptr / indices / last-page-length
+    # arrays that wrapper.plan() takes.
+    qo_indptr = [0]
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(num_seqs):
+        seq_len = kv_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        # Collect plain ints rather than 0-d device tensors.
+        kv_indices.extend(block_tables[i, :num_blocks].tolist())
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+        qo_indptr.append(qo_indptr[-1] + query_lens[i])
+
+    qo_indptr = torch.tensor(qo_indptr, dtype=torch.int32)
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(workspace_buffer, "NHD")
+    wrapper.plan(
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        block_size,
+        window_left=sliding_window - 1 if sliding_window is not None else -1,
+        q_data_type=dtype,
+        kv_data_type=dtype,
+        logits_soft_cap=soft_cap,
+    )
+
+    output = wrapper.run(
+        query,
+        key_value_cache,
+    )
+
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        soft_cap=soft_cap,
+        sliding_window=sliding_window,
+    )
+    # assert_close raises with its own mismatch report, so call it directly
+    # instead of wrapping it in a tuple with a discarded message string.
+    torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("seq_lens", [[(1, 132), (5, 18)]])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
+@pytest.mark.skip(reason="TODO: fix the accuracy issue")
+def test_flashinfer_prefill_with_paged_fp8_kv(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+) -> None:
+    """Check FlashInfer batched prefill with an FP8 KV cache against the reference.
+
+    Currently skipped through the ``skip`` marker (same mechanism as the
+    decode FP8 test) rather than an unconditional ``pytest.skip`` call that
+    left the rest of the body unreachable.
+
+    ``seq_lens`` holds ``(query_len, kv_len)`` per sequence and ``num_heads``
+    is ``(num_query_heads, num_kv_heads)``.
+    """
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    scale = head_size**-0.5
+
+    kv_cache_dtype = torch.float8_e4m3fn
+
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
+    NUM_BLOCKS_FP8 = 2048
+    key_value_cache = torch.randn(
+        NUM_BLOCKS_FP8, 2, block_size, num_kv_heads, head_size, dtype=dtype
+    )
+    # chunk() keeps dim 1 (size 1), so the in-place divisions below also
+    # rescale key_value_cache itself.
+    key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
+    key_cache /= head_size**0.5
+    value_cache /= head_size**0.5
+
+    # Per-tensor scales that map the largest value onto 448.0.
+    k_scale = key_cache.amax().item() / 448.0
+    v_scale = value_cache.amax().item() / 448.0
+
+    kv_cache_fp8 = torch.cat([key_cache / k_scale, value_cache / v_scale], dim=1).to(
+        kv_cache_dtype
+    )
+
+    assert kv_cache_fp8.shape == key_value_cache.shape
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, NUM_BLOCKS_FP8, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    # Flatten the block tables into the indptr / indices / last-page-length
+    # arrays that wrapper.plan() takes.
+    qo_indptr = [0]
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(num_seqs):
+        seq_len = kv_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        # Collect plain ints rather than 0-d device tensors.
+        kv_indices.extend(block_tables[i, :num_blocks].tolist())
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+        qo_indptr.append(qo_indptr[-1] + query_lens[i])
+
+    qo_indptr = torch.tensor(qo_indptr, dtype=torch.int32)
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(workspace_buffer, "NHD")
+    wrapper.plan(
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        block_size,
+        q_data_type=dtype,
+        kv_data_type=kv_cache_dtype,
+        logits_soft_cap=soft_cap,
+    )
+
+    output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale)
+
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache.squeeze(1),
+        value_cache=value_cache.squeeze(1),
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        soft_cap=soft_cap,
+    )
+    del query
+    del block_tables
+    # verify prefill fp8
+    # assert_close raises with its own mismatch report, so call it directly
+    # instead of wrapping it in a tuple with a discarded message string.
+    torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
+@pytest.mark.skip(reason="TODO: fix the accuracy issue")
+@torch.inference_mode
+def test_flashinfer_decode_with_paged_fp8_kv(
+    kv_lens: list[int],
+    num_heads: tuple[int, int],
+    head_size: int,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+) -> None:
+    """Check FlashInfer batched decode with an FP8 KV cache against the reference.
+
+    Currently skipped through the ``skip`` marker. Every sequence contributes
+    one query token; ``kv_lens`` gives the KV length per sequence and
+    ``num_heads`` is ``(num_query_heads, num_kv_heads)``.
+    """
+    # test doesn't work for num_heads = (16,16)
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(0)
+    num_seqs = len(kv_lens)
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    scale = head_size**-0.5
+    use_tensor_cores = True
+    kv_cache_dtype = torch.float8_e4m3fn
+
+    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
+    NUM_BLOCKS_FP8 = 2048
+    key_value_cache = torch.randn(
+        NUM_BLOCKS_FP8, 2, block_size, num_kv_heads, head_size, dtype=dtype
+    )
+    # chunk() keeps dim 1 (size 1), so the in-place divisions below also
+    # rescale key_value_cache itself.
+    key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
+    key_cache /= head_size**0.5
+    value_cache /= head_size**0.5
+
+    # Per-tensor scales that map the largest value onto 448.0.
+    k_scale = key_cache.amax().item() / 448.0
+    v_scale = value_cache.amax().item() / 448.0
+
+    key_cache_fp8 = (key_cache / k_scale).to(kv_cache_dtype)
+    value_cache_fp8 = (value_cache / v_scale).to(kv_cache_dtype)
+    assert key_cache_fp8.shape[1] == 1 and value_cache_fp8.shape[1] == 1
+    kv_cache_fp8 = torch.cat([key_cache_fp8, value_cache_fp8], dim=1)
+
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, NUM_BLOCKS_FP8, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    # Flatten the block tables into the indptr / indices / last-page-length
+    # arrays that wrapper.plan() takes.
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(num_seqs):
+        seq_len = kv_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        # Collect plain ints rather than 0-d device tensors.
+        kv_indices.extend(block_tables[i, :num_blocks].tolist())
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores
+    )
+    wrapper.plan(
+        kv_indptr,
+        kv_indices,
+        kv_last_page_lens,
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        block_size,
+        "NONE",
+        q_data_type=dtype,
+        kv_data_type=kv_cache_dtype,
+        logits_soft_cap=soft_cap,
+    )
+    output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale)
+    # Reference inputs: the (already rescaled) bf16 halves of key_value_cache.
+    key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
+    value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
+
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=[1] * num_seqs,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        soft_cap=soft_cap,
+    )
+    # Temporary fix: Increasing the tolerance. Seems like a flashinfer issue
+    # assert_close raises with its own mismatch report, so call it directly
+    # instead of wrapping it in a tuple with a discarded message string.
+    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2)
--- a/tests/kernels/attention/test_flashinfer_mla_decode.py
+++ b/tests/kernels/attention/test_flashinfer_mla_decode.py
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+from vllm.platforms import current_platform
+
+# Number of elements in the uint8 scratch buffer handed to the FlashInfer
+# kernel, i.e. 128 MiB.
+FLASHINFER_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
+
+# Skip the whole module unless the device reports capability 100 or higher.
+# The flashinfer import lives on the other branch so that it is only attempted
+# when the tests can actually run.
+if not current_platform.has_device_capability(100):
+    pytest.skip(
+        reason="FlashInfer MLA Requires compute capability of 10 or above.",
+        allow_module_level=True,
+    )
+else:
+    from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
+
+
+def ref_mla(
+    out: Tensor,  # (bs, num_heads, v_head_dim)
+    query: Tensor,  # (bs, num_heads, head_dim)
+    kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
+    scale: float,
+    block_tables: Tensor,  # (bs, max_num_blocks)
+    seq_lens: Tensor,  # (bs,)
+):
+    """Reference MLA decode computed one request at a time with PyTorch SDPA.
+
+    For every request the pages named in its block-table row are gathered,
+    flattened into a single token axis and truncated to the request's length.
+    The full rows act as keys and their first ``v_head_dim`` channels as
+    values, shared by all query heads.
+
+    Args:
+        out: Buffer that is overwritten in place, one row per request.
+        query: One decode query per request.
+        kv_cache: Paged cache indexed by block id.
+        scale: Softmax scale forwarded to SDPA.
+        block_tables: Block ids belonging to each request.
+        seq_lens: Number of valid tokens per request.
+
+    Returns:
+        ``out`` itself, after it has been filled with the attention results.
+    """
+    batch, n_heads, value_dim = out.shape
+    qk_dim = query.shape[2]
+
+    for req in range(batch):
+        # Gather this request's pages, flatten them into one token axis and
+        # keep only the valid prefix: (1, seq_len, head_dim).
+        pages = kv_cache[block_tables[req]]
+        keys = pages.view(1, -1, qk_dim)[:, : seq_lens[req]]
+        values = keys[:, :, :value_dim]
+
+        q_req = query[req].view(n_heads, 1, qk_dim)
+        # enable_gqa lets the single KV head serve every query head.
+        attn = F.scaled_dot_product_attention(
+            q_req, keys, values, scale=scale, enable_gqa=True
+        )
+        out[req] = attn.view(n_heads, value_dim)
+
+    return out
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("bs", [1, 2, 4, 16])
+@pytest.mark.parametrize("block_size", [32, 64])
+def test_flashinfer_mla_decode(dtype: torch.dtype, bs: int, block_size: int):
+    """Check FlashInfer's TRT-LLM MLA decode kernel against ``ref_mla``.
+
+    Builds ``bs`` requests with random KV lengths (the last one pinned to
+    ``MAX_SEQ_LEN``), gives each a disjoint slice of randomly permuted block
+    ids, and compares the kernel output with the PyTorch reference.
+
+    Args:
+        dtype: Element type of the query and the KV cache.
+        bs: Number of decode requests in the batch.
+        block_size: Tokens per KV-cache block.
+    """
+    torch.set_default_device("cuda")
+    torch.manual_seed(42)
+
+    # Deepseek R1 config
+    num_heads = 128
+    kv_lora_rank = 512
+    qk_nope_head_dim = 128
+    qk_rope_head_dim = 64
+    qk_head_dim = kv_lora_rank + qk_rope_head_dim
+    scale = (qk_nope_head_dim + qk_rope_head_dim) ** -0.5
+
+    MAX_SEQ_LEN = 1024
+
+    seq_lens = [torch.randint(2, MAX_SEQ_LEN, (1,)).item() for _ in range(bs)]
+    # Pin one request to the maximum so the longest case is always covered.
+    seq_lens[-1] = MAX_SEQ_LEN
+    max_seq_len = max(seq_lens)
+    seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32)
+
+    # Generate block tables with random but unique block IDs
+    # From https://github.com/flashinfer-ai/flashinfer/pull/1222
+    blocks_per_seq = (seq_lens_tensor + block_size - 1) // block_size
+    # The table width is clamped to at least 4 columns.
+    max_num_blocks_per_seq = max(blocks_per_seq.max().item(), 4)
+    # Reduce on the tensor and convert once instead of summing element by
+    # element in Python.
+    total_blocks_needed = int(blocks_per_seq.sum().item())
+    # Get random unique IDs for all blocks
+    all_block_ids = torch.randperm(total_blocks_needed)
+
+    block_tables = torch.zeros(
+        (bs, max_num_blocks_per_seq),
+        dtype=torch.int32,
+    )
+
+    # Populate block tables: each request takes the next contiguous slice of
+    # the permutation, so no block id is shared between requests.
+    block_id = 0
+    for i in range(bs):
+        num_blocks_needed = blocks_per_seq[i]
+        block_tables[i, :num_blocks_needed] = all_block_ids[
+            block_id : block_id + num_blocks_needed
+        ]
+        block_id += num_blocks_needed
+
+    kv_cache = torch.randn(block_tables.numel(), block_size, qk_head_dim).to(dtype)
+    q = torch.randn(bs, num_heads, qk_head_dim).to(dtype)
+
+    out_ref = q.new_zeros(bs, num_heads, kv_lora_rank)
+    ref_mla(out_ref, q, kv_cache, scale, block_tables, seq_lens_tensor)
+
+    workspace_buffer = torch.zeros(
+        FLASHINFER_WORKSPACE_BUFFER_SIZE,
+        dtype=torch.uint8,
+        device=q.device,
+    )
+    # Flashinfer MLA expects the query to be of shape
+    # (bs, q_len_per_request, num_heads, qk_head_dim),
+    # where q_len_per_request is the MTP query length (=1 without MTP)
+    q = q.unsqueeze(1)
+
+    out_ans = trtllm_batch_decode_with_kv_cache_mla(
+        query=q,
+        kv_cache=kv_cache.unsqueeze(1),
+        workspace_buffer=workspace_buffer,
+        qk_nope_head_dim=qk_nope_head_dim,
+        kv_lora_rank=kv_lora_rank,
+        qk_rope_head_dim=qk_rope_head_dim,
+        block_tables=block_tables,
+        seq_lens=seq_lens_tensor,
+        max_seq_len=max_seq_len,
+        bmm1_scale=scale,
+    )
+    # Drop the q_len_per_request axis again before comparing.
+    out_ans = out_ans.squeeze(1)
+    torch.testing.assert_close(out_ans, out_ref, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -0,0 +1,457 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from tests.kernels.quantization.nvfp4_utils import (
+    dequantize_nvfp4_to_dtype,
+    get_nvfp4_global_scale,
+)
+from vllm.platforms import current_platform
+from vllm.utils.math_utils import round_up
+
+# Skip the whole module unless the device belongs to capability family 100.
+# flashinfer is imported on the other branch so the import is only attempted
+# when the tests can run.
+if not current_platform.is_device_capability_family(100):
+    pytest.skip(
+        "This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True
+    )
+else:
+    import flashinfer
+
+# Size of one float32 element in bytes.
+FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
+# FP8 dtype chosen by the current platform; used for the quantized cases.
+FP8_DTYPE = current_platform.fp8_dtype()
+# FP4 results are held in uint8 storage (the tests allocate head_size // 2
+# bytes per head for them).
+FP4_DTYPE = torch.uint8
+
+
+def to_float8(x, dtype=torch.float8_e4m3fn):
+    """Quantize ``x`` to an FP8 dtype with a single per-tensor scale.
+
+    The scale maps the largest magnitude in ``x`` to one tenth of the dtype's
+    maximum; scaled values are clamped to the representable range before the
+    cast.
+
+    Args:
+        x: Tensor to quantize.
+        dtype: Target FP8 dtype.
+
+    Returns:
+        A pair ``(quantized, descale)`` where ``descale`` is the float32
+        reciprocal of the applied scale, i.e. the factor that maps the
+        quantized values back to the original range.
+    """
+    limits = torch.finfo(dtype)
+    lowest, highest = x.aminmax()
+    # Guard against an all-zero input before dividing.
+    abs_max = torch.maximum(lowest.abs(), highest.abs()).clamp(min=1e-12)
+    scale = limits.max / abs_max * 0.1
+    quantized = (x * scale).clamp(min=limits.min, max=limits.max).to(dtype)
+    descale = scale.float().reciprocal()
+    return quantized, descale
+
+
+# Parametrization grids shared by the decode and prefill tests in this file.
+DTYPE = [torch.bfloat16]
+QUANT_DTYPES = [
+    # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
+    # None means "not quantized": the tests substitute the base dtype.
+    (None, None, None),
+    (None, FP8_DTYPE, None),
+    (FP8_DTYPE, FP8_DTYPE, None),
+    (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
+    (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
+]
+BATCH_SIZE = [4, 12]
+MAX_SEQ_LENS = [(1024, 4096)]  # (max_q_len, max_kv_len)
+NUM_HEADS = [(64, 8), (40, 8)]  # (num_qo_heads, num_kv_heads)
+HEAD_SIZE = [128]
+KV_LAYOUT = ["HND"]  # currently only HND is supported
+BLOCK_SIZE = [16]
+WINDOW_LEFT = [-1, 127]  # forwarded unchanged as window_left
+SOFT_CAP = [None, 50.0]
+HAS_SINKS = [True, False]
+
+NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
+
+
+@pytest.mark.parametrize("dtype", DTYPE)
+@pytest.mark.parametrize("quant_dtypes", QUANT_DTYPES)
+@pytest.mark.parametrize("batch_size", BATCH_SIZE)
+@pytest.mark.parametrize("max_seq_lens", MAX_SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZE)
+@pytest.mark.parametrize("kv_layout", KV_LAYOUT)
+@pytest.mark.parametrize("block_size", BLOCK_SIZE)
+@pytest.mark.parametrize("window_left", WINDOW_LEFT)
+@pytest.mark.parametrize("soft_cap", SOFT_CAP)
+@pytest.mark.parametrize("has_sinks", HAS_SINKS)
+@torch.inference_mode
+def test_flashinfer_trtllm_decode_with_baseline(
+    dtype: torch.dtype,
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
+    batch_size: int,
+    max_seq_lens: tuple[int, int],
+    num_heads: tuple[int, int],
+    head_size: int,
+    kv_layout: str,
+    block_size: int,
+    window_left: int,
+    soft_cap: float | None,
+    has_sinks: bool,
+) -> None:
+    """Compare the TRT-LLM decode kernel with a FlashInfer FA2 baseline.
+
+    The baseline runs a FlashInfer paged-KV wrapper on dequantized inputs with
+    one query token per request. The kernel under test,
+    ``flashinfer.decode.trtllm_batch_decode_with_kv_cache``, receives the
+    (possibly FP8-quantized) query and KV cache and writes a bf16, FP8 or FP4
+    output, which is dequantized before the comparison.
+
+    Args:
+        dtype: Unquantized base dtype.
+        quant_dtypes: ``(q, kv, o)`` quantized dtypes; ``None`` entries fall
+            back to ``dtype``.
+        batch_size: Number of requests.
+        max_seq_lens: ``(max_q_len, max_kv_len)``; only the KV bound is used.
+        num_heads: ``(num_qo_heads, num_kv_heads)``.
+        head_size: Per-head dimension.
+        kv_layout: ``"NHD"`` or ``"HND"`` cache layout.
+        block_size: Tokens per KV-cache page.
+        window_left: Passed to both implementations as ``window_left``.
+        soft_cap: Logits soft cap given to the baseline plan.
+        has_sinks: Whether to generate per-head attention sinks.
+    """
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(42)
+
+    q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
+    q_quant_dtype = q_quant_dtype or dtype
+    kv_quant_dtype = kv_quant_dtype or dtype
+    o_quant_dtype = o_quant_dtype or dtype
+
+    _, max_kv_len = max_seq_lens
+
+    num_qo_heads, num_kv_heads = num_heads
+    assert num_qo_heads % num_kv_heads == 0
+
+    sm_scale = float(1.0 / (head_size**0.5))
+
+    kv_cache_shape = None
+    if kv_layout == "NHD":
+        kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size)
+    elif kv_layout == "HND":
+        kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size)
+    else:
+        raise ValueError(f"Invalid kv_layout: {kv_layout}")
+
+    # max_q_len = 1
+    q_lens = torch.ones((batch_size,), dtype=torch.int32)
+    q_indptr = torch.cat(
+        [
+            torch.tensor([0], dtype=torch.int32),
+            torch.cumsum(q_lens, dim=0, dtype=torch.int32),
+        ]
+    )
+
+    query = torch.randn(torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype)
+    if q_quant_dtype == FP8_DTYPE:
+        # The baseline consumes the dequantized query so both paths see the
+        # same quantization error.
+        query, q_scale = to_float8(query)
+        ref_query = query.to(dtype) * q_scale
+    else:
+        q_scale = 1.0
+        ref_query = query
+
+    kv_lens = torch.randint(1, max_kv_len, (batch_size,), dtype=torch.int32)
+    # Pin one request to the maximum KV length.
+    kv_lens[-1] = max_kv_len
+
+    seq_lens = kv_lens + q_lens
+    max_seq_len = torch.max(seq_lens).item()
+
+    kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
+    if kv_quant_dtype == FP8_DTYPE:
+        kv_cache, kv_scale = to_float8(kv_cache)
+        ref_kv_cache = kv_cache.to(dtype) * kv_scale
+    else:
+        kv_scale = 1.0
+        ref_kv_cache = kv_cache
+    k_scale = v_scale = kv_scale
+
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
+    )
+    # Flatten the block tables into the indptr / indices / last-page-length
+    # form that the baseline wrapper's plan() takes.
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(batch_size):
+        seq_len = seq_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        kv_indices.extend(block_tables[i, :num_blocks])
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        # A sequence that ends exactly on a page boundary fills its last page.
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+    workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
+
+    # Baseline Decode
+    if has_sinks:
+        sinks = torch.rand(num_qo_heads, dtype=torch.float32) * 5
+        wrapper = flashinfer.BatchAttentionWithAttentionSinkWrapper(
+            float_workspace_buffer=workspace_buffer, kv_layout=kv_layout, backend="fa2"
+        )
+    else:
+        sinks = None
+        wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
+            float_workspace_buffer=workspace_buffer, kv_layout=kv_layout, backend="fa2"
+        )
+
+    wrapper.plan(
+        qo_indptr=q_indptr,
+        paged_kv_indptr=kv_indptr,
+        paged_kv_indices=kv_indices,
+        paged_kv_last_page_len=kv_last_page_lens,
+        num_qo_heads=num_qo_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim_qk=head_size,
+        page_size=block_size,
+        causal=True,
+        sm_scale=sm_scale,
+        window_left=window_left,
+        logits_soft_cap=soft_cap,
+        q_data_type=dtype,
+        kv_data_type=dtype,
+    )
+    output = torch.empty(ref_query.shape, dtype=dtype)
+    wrapper.run(ref_query, ref_kv_cache, sinks, sm_scale, out=output)
+
+    # Output quantization scales are derived from the baseline result.
+    o_scale = 1.0
+    o_sf_scale_float = None
+    if o_quant_dtype == FP8_DTYPE:
+        _, o_scale = to_float8(output)
+    elif o_quant_dtype == FP4_DTYPE:
+        o_sf_scale = get_nvfp4_global_scale(output)
+        o_sf_scale_float = o_sf_scale.item()
+
+    # TRTLLM Decode
+    if o_quant_dtype == FP4_DTYPE:
+        # FP4 output: packed uint8 data (half the head size) plus a padded
+        # float8 scale-factor tensor.
+        output_trtllm = flashinfer.utils.FP4Tensor(
+            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
+            torch.empty(
+                (
+                    round_up(query.shape[0], 128),
+                    round_up(query.shape[1] * query.shape[2] // 16, 4),
+                ),
+                dtype=torch.float8_e4m3fn,
+            ),
+        )
+    else:
+        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
+
+    flashinfer.decode.trtllm_batch_decode_with_kv_cache(
+        query=query,
+        kv_cache=kv_cache,
+        workspace_buffer=workspace_buffer,
+        block_tables=block_tables,
+        seq_lens=seq_lens,
+        max_seq_len=max_seq_len,
+        bmm1_scale=q_scale * k_scale * sm_scale,
+        bmm2_scale=v_scale / o_scale,
+        window_left=window_left,
+        sinks=sinks,
+        o_sf_scale=o_sf_scale_float,
+        out=output_trtllm,
+    )
+    # Bring the kernel output back to `dtype` before comparing.
+    if o_quant_dtype == FP8_DTYPE:
+        output_trtllm = output_trtllm.to(dtype) * o_scale
+    elif o_quant_dtype == FP4_DTYPE:
+        output_trtllm.data = output_trtllm.data.reshape(
+            -1, query.shape[1] * query.shape[2] // 2
+        )
+        output_trtllm = dequantize_nvfp4_to_dtype(
+            output_trtllm.data, output_trtllm.scale, o_sf_scale, dtype, query.device
+        )
+        output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])
+
+    # Tolerances widen with the amount of quantization involved.
+    if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
+        rtol, atol = 7e-2, 9e-2
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
+        rtol, atol = 3e-2, 4e-2
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
+        rtol, atol = 2e-2, 2e-2
+    elif kv_quant_dtype == FP8_DTYPE:
+        rtol, atol = 4e-2, 6e-2
+    else:
+        rtol, atol = 1e-2, 1e-2
+
+    # assert_close already reports the greatest absolute and relative
+    # differences when it fails, so no extra message is attached.
+    torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol)
+
+
+@pytest.mark.parametrize("dtype", DTYPE)
+@pytest.mark.parametrize("quant_dtypes", QUANT_DTYPES)
+@pytest.mark.parametrize("batch_size", BATCH_SIZE)
+@pytest.mark.parametrize("max_seq_lens", MAX_SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZE)
+@pytest.mark.parametrize("kv_layout", KV_LAYOUT)
+@pytest.mark.parametrize("block_size", BLOCK_SIZE)
+@pytest.mark.parametrize("window_left", WINDOW_LEFT)
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("has_sinks", HAS_SINKS)
+@torch.inference_mode
+def test_flashinfer_trtllm_prefill_with_baseline(
+    dtype: torch.dtype,
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
+    batch_size: int,
+    max_seq_lens: tuple[int, int],
+    num_heads: tuple[int, int],
+    head_size: int,
+    kv_layout: str,
+    block_size: int,
+    window_left: int,
+    soft_cap: float | None,
+    has_sinks: bool,
+) -> None:
+    """Compare the TRT-LLM prefill kernel with a FlashInfer FA2 baseline.
+
+    The baseline runs a FlashInfer paged-KV wrapper on dequantized inputs with
+    random query lengths. The kernel under test,
+    ``flashinfer.prefill.trtllm_batch_context_with_kv_cache``, receives the
+    (possibly FP8-quantized) query and KV cache and writes a bf16, FP8 or FP4
+    output, which is dequantized before the comparison. Cases where the query
+    and KV dtypes differ are skipped.
+
+    Args:
+        dtype: Unquantized base dtype.
+        quant_dtypes: ``(q, kv, o)`` quantized dtypes; ``None`` entries fall
+            back to ``dtype``.
+        batch_size: Number of requests.
+        max_seq_lens: ``(max_q_len, max_kv_len)``.
+        num_heads: ``(num_qo_heads, num_kv_heads)``.
+        head_size: Per-head dimension.
+        kv_layout: ``"NHD"`` or ``"HND"`` cache layout.
+        block_size: Tokens per KV-cache page.
+        window_left: Passed to both implementations as ``window_left``.
+        soft_cap: Logits soft cap given to the baseline plan (always ``None``
+            for this test).
+        has_sinks: Whether to generate per-head attention sinks.
+    """
+    torch.set_default_device("cuda")
+    current_platform.seed_everything(42)
+
+    q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
+    q_quant_dtype = q_quant_dtype or dtype
+    kv_quant_dtype = kv_quant_dtype or dtype
+    o_quant_dtype = o_quant_dtype or dtype
+
+    if q_quant_dtype != kv_quant_dtype:
+        pytest.skip("Skipped mixed QKV dtypes for prefill")
+
+    max_q_len, max_kv_len = max_seq_lens
+
+    num_qo_heads, num_kv_heads = num_heads
+    assert num_qo_heads % num_kv_heads == 0
+
+    sm_scale = float(1.0 / (head_size**0.5))
+
+    kv_cache_shape = None
+    if kv_layout == "NHD":
+        kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size)
+    elif kv_layout == "HND":
+        kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size)
+    else:
+        raise ValueError(f"Invalid kv_layout: {kv_layout}")
+
+    q_lens = torch.randint(1, max_q_len, (batch_size,), dtype=torch.int32)
+    # Pin one request to the maximum query length.
+    q_lens[-1] = max_q_len
+    q_indptr = torch.cat(
+        [
+            torch.tensor([0], dtype=torch.int32),
+            torch.cumsum(q_lens, dim=0, dtype=torch.int32),
+        ]
+    )
+
+    query = torch.randn(torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype)
+    if q_quant_dtype == FP8_DTYPE:
+        # The baseline consumes the dequantized query so both paths see the
+        # same quantization error.
+        query, q_scale = to_float8(query)
+        ref_query = query.to(dtype) * q_scale
+    else:
+        q_scale = 1.0
+        ref_query = query
+
+    kv_lens = torch.randint(1, max_kv_len, (batch_size,), dtype=torch.int32)
+    # Pin one request to the maximum KV length.
+    kv_lens[-1] = max_kv_len
+
+    seq_lens = kv_lens + q_lens
+    max_seq_len = torch.max(seq_lens).item()
+
+    kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
+    if kv_quant_dtype == FP8_DTYPE:
+        kv_cache, kv_scale = to_float8(kv_cache)
+        ref_kv_cache = kv_cache.to(dtype) * kv_scale
+    else:
+        kv_scale = 1.0
+        ref_kv_cache = kv_cache
+    k_scale = v_scale = kv_scale
+
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
+    )
+    # Flatten the block tables into the indptr / indices / last-page-length
+    # form that the baseline wrapper's plan() takes.
+    kv_indptr = [0]
+    kv_indices = []
+    kv_last_page_lens = []
+    for i in range(batch_size):
+        seq_len = seq_lens[i]
+        assert seq_len > 0
+        num_blocks = (seq_len + block_size - 1) // block_size
+        kv_indices.extend(block_tables[i, :num_blocks])
+        kv_indptr.append(kv_indptr[-1] + num_blocks)
+        kv_last_page_len = seq_len % block_size
+        # A sequence that ends exactly on a page boundary fills its last page.
+        if kv_last_page_len == 0:
+            kv_last_page_len = block_size
+        kv_last_page_lens.append(kv_last_page_len)
+
+    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
+    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
+    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
+    workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
+
+    # Baseline Prefill
+    if has_sinks:
+        sinks = torch.rand(num_qo_heads, dtype=torch.float32) * 5
+        wrapper = flashinfer.BatchAttentionWithAttentionSinkWrapper(
+            float_workspace_buffer=workspace_buffer, kv_layout=kv_layout, backend="fa2"
+        )
+    else:
+        sinks = None
+        wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
+            float_workspace_buffer=workspace_buffer, kv_layout=kv_layout, backend="fa2"
+        )
+
+    wrapper.plan(
+        qo_indptr=q_indptr,
+        paged_kv_indptr=kv_indptr,
+        paged_kv_indices=kv_indices,
+        paged_kv_last_page_len=kv_last_page_lens,
+        num_qo_heads=num_qo_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim_qk=head_size,
+        page_size=block_size,
+        causal=True,
+        sm_scale=sm_scale,
+        window_left=window_left,
+        logits_soft_cap=soft_cap,
+        q_data_type=dtype,
+        kv_data_type=dtype,
+    )
+    output = torch.empty(ref_query.shape, dtype=dtype)
+    wrapper.run(ref_query, ref_kv_cache, sinks, sm_scale, out=output)
+
+    # Output quantization scales are derived from the baseline result.
+    o_scale = 1.0
+    o_sf_scale_float = None
+    if o_quant_dtype == FP8_DTYPE:
+        _, o_scale = to_float8(output)
+    elif o_quant_dtype == FP4_DTYPE:
+        o_sf_scale = get_nvfp4_global_scale(output)
+        o_sf_scale_float = o_sf_scale.item()
+
+    # TRTLLM Prefill
+    if o_quant_dtype == FP4_DTYPE:
+        # FP4 output: packed uint8 data (half the head size) plus a padded
+        # float8 scale-factor tensor.
+        output_trtllm = flashinfer.utils.FP4Tensor(
+            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
+            torch.empty(
+                (
+                    round_up(query.shape[0], 128),
+                    round_up(query.shape[1] * query.shape[2] // 16, 4),
+                ),
+                dtype=torch.float8_e4m3fn,
+            ),
+        )
+    else:
+        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
+
+    # NOTE(review): kv_indptr accumulates page counts (see the loop above),
+    # not token counts - confirm that is what cum_seq_lens_kv expects.
+    flashinfer.prefill.trtllm_batch_context_with_kv_cache(
+        query=query,
+        kv_cache=kv_cache,
+        workspace_buffer=workspace_buffer,
+        block_tables=block_tables,
+        seq_lens=seq_lens,
+        max_q_len=max_q_len,
+        max_kv_len=max_seq_len,
+        bmm1_scale=q_scale * k_scale * sm_scale,
+        bmm2_scale=v_scale / o_scale,
+        batch_size=batch_size,
+        cum_seq_lens_q=q_indptr,
+        cum_seq_lens_kv=kv_indptr,
+        window_left=window_left,
+        sinks=sinks,
+        o_sf_scale=o_sf_scale_float,
+        out=output_trtllm,
+    )
+    # Bring the kernel output back to `dtype` before comparing.
+    if o_quant_dtype == FP8_DTYPE:
+        output_trtllm = output_trtllm.to(dtype) * o_scale
+    elif o_quant_dtype == FP4_DTYPE:
+        output_trtllm.data = output_trtllm.data.reshape(
+            -1, query.shape[1] * query.shape[2] // 2
+        )
+        output_trtllm = dequantize_nvfp4_to_dtype(
+            output_trtllm.data, output_trtllm.scale, o_sf_scale, dtype, query.device
+        )
+        output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])
+
+    # Tolerances widen with the amount of quantization involved.
+    if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
+        rtol, atol = 3e-1, 4e-1
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
+        rtol, atol = 4e-2, 6e-2
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
+        rtol, atol = 2e-2, 3e-2
+    else:
+        rtol, atol = 1e-2, 1e-2
+
+    # assert_close already reports the greatest absolute and relative
+    # differences when it fails, so no extra message is attached.
+    torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol)
--- a/tests/kernels/attention/test_flashmla.py
+++ b/tests/kernels/attention/test_flashmla.py
@@ -0,0 +1,178 @@
+# Adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/tests/test_flash_mla.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+import random
+
+import pytest
+import torch
+
+from vllm.attention.ops.flashmla import (
+    flash_mla_with_kvcache,
+    get_mla_metadata,
+    is_flashmla_dense_supported,
+)
+from vllm.triton_utils import triton
+
+
+def cal_diff(
+    x: torch.Tensor, y: torch.Tensor, name: str, use_fp8: bool = False
+) -> None:
+    """Assert that two tensors agree under a cosine-style difference metric.
+
+    The metric is ``1 - 2 * sum(x * y) / sum(x * x + y * y)``, evaluated in
+    float64; it is zero when the tensors are identical. NaN values make the
+    comparison fail.
+
+    Args:
+        x: First tensor.
+        y: Second tensor, same shape as ``x``.
+        name: Label for the compared quantity, reported on failure.
+        use_fp8: Use the looser ``1e-4`` threshold instead of ``1e-5``.
+
+    Raises:
+        AssertionError: If the difference is not below the threshold.
+    """
+    x, y = x.double(), y.double()
+    # The max() guards against division by zero when both tensors are all zero.
+    cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)
+    threshold = 1e-4 if use_fp8 else 1e-5
+    assert cos_diff < threshold, (
+        f"{name}: cos_diff={cos_diff} is not below {threshold}"
+    )
+
+
+# Skip reason for the test below. is_flashmla_dense_supported() returns a pair
+# whose first item is the support flag and whose second item is used here as
+# the explanation when the flag is false.
+FLASH_MLA_UNSUPPORTED_REASON = (
+    is_flashmla_dense_supported()[1]
+    if not is_flashmla_dense_supported()[0]
+    else "FlashMLA is supported"
+)
+
+
+@pytest.mark.skipif(
+    not is_flashmla_dense_supported()[0], reason=FLASH_MLA_UNSUPPORTED_REASON
+)
+@pytest.mark.parametrize("b", [128])
+@pytest.mark.parametrize("s_q", [1, 2])
+@pytest.mark.parametrize("mean_sk", [4096, 8192, 16384])
+@pytest.mark.parametrize("h_q", [16, 32, 64, 128])
+@pytest.mark.parametrize("h_kv", [1])
+@pytest.mark.parametrize("d", [576])
+@pytest.mark.parametrize("dv", [512])
+@pytest.mark.parametrize("block_size", [64])
+@pytest.mark.parametrize("causal", [True])
+@pytest.mark.parametrize("varlen", [False, True])
+@pytest.mark.parametrize(
+    "torch_dtype", [torch.bfloat16, torch.float16, torch.float8_e4m3fn]
+)
+@torch.inference_mode()
+def test_flash_mla(
+    b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, varlen, torch_dtype
+):
+    """Check ``flash_mla_with_kvcache`` against a float32 PyTorch reference.
+
+    Both the attention output and the log-sum-exp are compared with
+    ``cal_diff``; afterwards the kernel is timed and throughput is printed.
+
+    Args:
+        b: Batch size.
+        s_q: Query tokens per request.
+        mean_sk: Cache length per request, or the mean of the sampled lengths
+            when ``varlen`` is set.
+        h_q: Number of query heads.
+        h_kv: Number of KV heads.
+        d: Query/key head dimension.
+        dv: Value head dimension (values are the first ``dv`` key channels).
+        block_size: Tokens per KV-cache block.
+        causal: Whether to apply causal masking.
+        varlen: Sample a different cache length for each request.
+        torch_dtype: Dtype under test; FP8 inputs are created in bfloat16 and
+            cast afterwards.
+    """
+    device = torch.device("cuda:0")
+    init_dtype = torch.bfloat16 if torch_dtype == torch.float8_e4m3fn else torch_dtype
+    torch.set_default_dtype(init_dtype)
+    torch.set_default_device(device)
+    torch.cuda.set_device(device)
+    torch.manual_seed(0)
+    random.seed(0)
+
+    print(
+        f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, "
+        f"{d=}, {dv=}, {causal=}, {varlen=}, {torch_dtype=}"
+    )
+
+    use_fp8 = torch_dtype == torch.float8_e4m3fn
+    cache_seqlens = torch.full((b,), mean_sk, dtype=torch.int32)
+    if varlen:
+        # Sample lengths around mean_sk, never shorter than the query.
+        for i in range(b):
+            cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2), s_q)
+    total_seqlens = cache_seqlens.sum().item()
+    max_seqlen = cache_seqlens.max().item()
+    # Every request gets the same padded slot, rounded up to a multiple of 256.
+    max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256
+
+    q = torch.randn(b, s_q, h_q, d)
+    block_table = torch.arange(
+        b * max_seqlen_pad // block_size, dtype=torch.int32
+    ).view(b, max_seqlen_pad // block_size)
+    blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d)
+    # Fill the padding after each request's valid length with NaN; presumably
+    # so that any read past the valid length shows up in the comparison.
+    for i in range(b):
+        blocked_k.view(b, max_seqlen_pad, h_kv, d)[i, cache_seqlens[i].item() :] = (
+            float("nan")
+        )
+    blocked_v = blocked_k[..., :dv]
+
+    tile_scheduler_metadata, num_splits = get_mla_metadata(
+        cache_seqlens, s_q * h_q // h_kv, h_kv
+    )
+
+    init_dtype = q.dtype
+    if use_fp8:
+        fp8_dtype = torch.float8_e4m3fn
+        descale_q = torch.ones((1), dtype=torch.float32)
+        descale_k = torch.ones((1), dtype=torch.float32)
+
+        q = q.to(fp8_dtype)
+        blocked_k = blocked_k.to(fp8_dtype)
+        blocked_v = blocked_v.to(fp8_dtype)
+    else:
+        descale_q = None
+        descale_k = None
+
+    def flash_mla():
+        # Kernel under test; also reused as the benchmark target below.
+        return flash_mla_with_kvcache(
+            q,
+            blocked_k,
+            block_table,
+            cache_seqlens,
+            dv,
+            tile_scheduler_metadata,
+            num_splits,
+            causal=causal,
+            descale_q=descale_q,
+            descale_k=descale_k,
+        )
+
+    def scaled_dot_product_attention(query, key, value, is_causal=False):
+        # Float32 reference attention that also returns the log-sum-exp.
+        query = query.float()
+        key = key.float()
+        value = value.float()
+        key = key.repeat_interleave(h_q // h_kv, dim=0)
+        value = value.repeat_interleave(h_q // h_kv, dim=0)
+        attn_weight = query @ key.transpose(-2, -1) / math.sqrt(query.size(-1))
+        if is_causal:
+            s_q = query.shape[-2]
+            s_k = key.shape[-2]
+            # attn_bias is created directly in query.dtype, so no extra cast
+            # is needed before it is added.
+            attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype)
+            temp_mask = torch.ones(s_q, s_k, dtype=torch.bool).tril(diagonal=s_k - s_q)
+            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+            attn_weight += attn_bias
+        lse = attn_weight.logsumexp(dim=-1)
+        attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32)
+        return attn_weight @ value, lse
+
+    def ref_mla():
+        # Dequantize FP8 inputs back to the initial dtype for the reference.
+        q_ = (q.to(torch.float) * descale_q).to(init_dtype) if use_fp8 else q
+        blocked_k_ = (
+            (blocked_k.to(torch.float) * descale_k).to(init_dtype)
+            if use_fp8
+            else blocked_k
+        )
+        blocked_v_ = (
+            (blocked_v.to(torch.float) * descale_k).to(init_dtype)
+            if use_fp8
+            else blocked_v
+        )
+        out = torch.empty(b, s_q, h_q, dv, dtype=torch.float32)
+        lse = torch.empty(b, h_q, s_q, dtype=torch.float32)
+        for i in range(b):
+            # Request i occupies one padded slot; only its valid prefix is used.
+            begin = i * max_seqlen_pad
+            end = begin + cache_seqlens[i]
+            out_i, lse_i = scaled_dot_product_attention(
+                q_[i].transpose(0, 1),
+                blocked_k_.view(-1, h_kv, d)[begin:end].transpose(0, 1),
+                blocked_v_.view(-1, h_kv, dv)[begin:end].transpose(0, 1),
+                is_causal=causal,
+            )
+            out[i] = out_i.transpose(0, 1)
+            lse[i] = lse_i
+        return out, lse
+
+    out_flash, lse_flash = flash_mla()
+    out_torch, lse_torch = ref_mla()
+    cal_diff(out_flash, out_torch, "out", use_fp8)
+    cal_diff(lse_flash, lse_torch, "lse")
+
+    t = triton.testing.do_bench(flash_mla)
+    FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2
+    # Named num_bytes rather than bytes so the builtin is not shadowed.
+    num_bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d) * (
+        torch.finfo(torch_dtype).bits // 8
+    ) + (b * s_q * h_q * dv) * (torch.finfo(init_dtype).bits // 8)
+    print(
+        f"{t:.3f} ms, {FLOPS / 10**9 / t:.0f} TFLOPS,",
+        f"{num_bytes / 10**6 / t:.0f} GB/s",
+    )
--- a/tests/kernels/attention/test_flashmla_sparse.py
+++ b/tests/kernels/attention/test_flashmla_sparse.py
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+
+def test_sparse_flashmla_metadata_smoke():
+    """Smoke-test ``get_mla_metadata`` with the sparse (top-k, FP8) arguments.
+
+    Only the dtypes of the two returned tensors are checked. The test is
+    skipped when sparse FlashMLA is reported as unsupported.
+    """
+    import vllm.attention.ops.flashmla as fm
+
+    supported, skip_reason = fm.is_flashmla_sparse_supported()
+    if not supported:
+        pytest.skip(skip_reason)
+
+    n_requests, q_len = 1, 1
+    n_q_heads, n_kv_heads = 128, 1
+    # All-zero cache lengths are enough for a smoke test.
+    seqlens = torch.zeros(
+        n_requests, dtype=torch.int32, device=torch.device("cuda")
+    )
+
+    metadata, splits = fm.get_mla_metadata(
+        seqlens,
+        q_len * n_q_heads // n_kv_heads,
+        n_kv_heads,
+        num_heads_q=n_q_heads,
+        topk=128,
+        is_fp8_kvcache=True,
+    )
+    for returned in (metadata, splits):
+        assert returned.dtype == torch.int32
+
+
+def test_sparse_flashmla_decode_smoke():
+    """Smoke-test sparse decode through ``flash_mla_with_kvcache``.
+
+    All inputs are zero tensors; only the leading and trailing dimensions of
+    the returned output and log-sum-exp are checked. The test is skipped when
+    sparse FlashMLA is reported as unsupported.
+    """
+    import vllm.attention.ops.flashmla as fm
+
+    supported, skip_reason = fm.is_flashmla_sparse_supported()
+    if not supported:
+        pytest.skip(skip_reason)
+
+    cuda = torch.device("cuda")
+    n_requests, q_len = 1, 1
+    n_q_heads, n_kv_heads = 1, 1
+    qk_dim, v_dim = 576, 512
+    page_size = 64
+    token_bytes = 656
+    top_k = 128
+
+    def zeros(shape, dtype):
+        # Every input of this test is a zero tensor on the CUDA device.
+        return torch.zeros(shape, dtype=dtype, device=cuda)
+
+    # Metadata
+    seqlens = zeros(n_requests, torch.int32)
+    metadata, splits = fm.get_mla_metadata(
+        seqlens,
+        q_len * n_q_heads // n_kv_heads,
+        n_kv_heads,
+        num_heads_q=n_q_heads,
+        topk=top_k,
+        is_fp8_kvcache=True,
+    )
+
+    # Inputs
+    query = zeros((n_requests, q_len, n_q_heads, qk_dim), torch.bfloat16)
+    # The cache is raw uint8 storage with token_bytes bytes per token.
+    cache = zeros((1, page_size, n_kv_heads, token_bytes), torch.uint8)
+    topk_indices = zeros((n_requests, q_len, top_k), torch.int32)
+    table = zeros((n_requests, 128), torch.int32)
+
+    out, lse = fm.flash_mla_with_kvcache(
+        query,
+        cache,
+        table,
+        seqlens,
+        v_dim,
+        metadata,
+        splits,
+        indices=topk_indices,
+        is_fp8_kvcache=True,
+    )
+    assert out.shape[0] == n_requests
+    assert out.shape[-1] == v_dim
+    assert lse.shape[0] == n_requests
+
+
+def test_sparse_flashmla_prefill_smoke():
+    """Smoke-test ``flash_mla_sparse_prefill`` on all-zero inputs.
+
+    Only the shapes of the three returned tensors are checked. The test is
+    skipped when sparse FlashMLA is reported as unsupported.
+    """
+    import vllm.attention.ops.flashmla as fm
+
+    supported, skip_reason = fm.is_flashmla_sparse_supported()
+    if not supported:
+        pytest.skip(skip_reason)
+
+    cuda = torch.device("cuda")
+    q_tokens, kv_tokens = 1, 1
+    q_heads = 64  # kernel expects multiple of 64
+    kv_heads = 1
+    qk_dim, v_dim = 576, 512
+    top_k = 128
+
+    query = torch.zeros((q_tokens, q_heads, qk_dim), dtype=torch.bfloat16, device=cuda)
+    kv = torch.zeros((kv_tokens, kv_heads, qk_dim), dtype=torch.bfloat16, device=cuda)
+    topk_indices = torch.zeros(
+        (q_tokens, kv_heads, top_k), dtype=torch.int32, device=cuda
+    )
+
+    out, max_logits, lse = fm.flash_mla_sparse_prefill(
+        query, kv, topk_indices, 1.0, v_dim
+    )
+    per_head_shape = (q_tokens, q_heads)
+    assert out.shape == (q_tokens, q_heads, v_dim)
+    assert max_logits.shape == per_head_shape
+    assert lse.shape == per_head_shape
--- a/tests/kernels/attention/test_lightning_attn.py
+++ b/tests/kernels/attention/test_lightning_attn.py
@@ -0,0 +1,266 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.lightning_attn import linear_decode_forward_triton
+from vllm.platforms import current_platform
+
+# Parameter grids consumed by the @pytest.mark.parametrize decorators of the
+# tests in this file.
+NUM_HEADS = [4, 8]
+HEAD_SIZES = [64]
+BATCH_SIZES = [1, 2]
+SEQ_LENGTHS = [16]
+DTYPES = [torch.float32]
+
+
+def reference_lightning_attention(q, k, v, ed, block_size, kv_history):
+    """Sequential reference for the lightning attention recurrence.
+
+    For every batch entry and head, each position first updates a running
+    state ``kv = decay * kv + outer(k_t, v_t)`` and then emits ``q_t @ kv``,
+    one step at a time in plain PyTorch.
+
+    Args:
+        q: Queries, unpacked as ``[B, H, S, D]``.
+        k: Keys, indexed the same way as ``q``.
+        v: Values, indexed as ``[B, H, S, E]``.
+        ed: Per-head decay exponents; ``exp(-ed)`` is the decay factor. A 1-D
+            tensor is reshaped to ``[1, H, 1, 1]``; any other rank is indexed
+            as if it already had that layout.
+        block_size: Not used by this reference; accepted so the call has the
+            same argument list as the implementation it is compared with.
+        kv_history: Optional initial state ``[B, H, D, E]``. It is cloned, so
+            the caller's tensor is never modified. ``None`` means all zeros.
+
+    Returns:
+        ``(output, final_kv_cache)`` where ``output`` is ``[B, H, S, E]`` and
+        ``final_kv_cache`` is the final state repeated twice along a new
+        dimension 2, i.e. ``[B, H, 2, D, E]``.
+    """
+    num_batches, num_heads, seq_len, qk_dim = q.shape
+    v_dim = v.shape[-1]
+    out = torch.zeros(
+        (num_batches, num_heads, seq_len, v_dim), dtype=q.dtype, device=q.device
+    )
+
+    # Work on a private state tensor so the caller's history stays untouched.
+    if kv_history is not None:
+        state = kv_history.clone()
+    else:
+        state = torch.zeros(
+            (num_batches, num_heads, qk_dim, v_dim), dtype=q.dtype, device=q.device
+        )
+
+    # Decay factor per head, read below as decay[0, h, 0, 0].
+    decay = torch.exp(-ed)
+    if ed.dim() == 1:
+        decay = decay.view(1, -1, 1, 1)
+
+    for b in range(num_batches):
+        for t in range(seq_len):
+            for h in range(num_heads):
+                # Decay the old state, then add the new outer product
+                # (noted in the original as the Triton kernel's order).
+                state[b, h] = decay[0, h, 0, 0] * state[b, h] + torch.outer(
+                    k[b, h, t], v[b, h, t]
+                )
+                out[b, h, t] = torch.matmul(q[b, h, t], state[b, h])
+
+    # Repeat the final state along a new dim 2 to produce [B, H, 2, D, E],
+    # the layout the caller compares against.
+    stacked = state.unsqueeze(2)  # [B, H, 1, D, E]
+    return out, torch.cat([stacked, stacked], dim=2)
+
+
+def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx):
+    """Reference implementation of one linear-attention decode step.
+
+    For every non-padding batch entry ``b`` and head ``h``, the cache entry
+    of the slot assigned to ``b`` is updated in place as
+    ``kv = outer(k, v) + exp(-slope_rate[h]) * kv`` and the output for that
+    head is ``q @ kv``.
+
+    Args:
+        q, k, v: Tensors unpacked as ``[B, H, 1, D]``; only index 0 of
+            dimension 2 is read.
+        kv_caches: Cache indexed as ``[slot, H, D, D]``. Updated in place.
+        slope_rate: Per-head decay exponents (``H`` elements).
+        slot_idx: For each batch entry, the cache slot it owns, or ``-1``
+            for a padding entry, which is skipped.
+
+    Returns:
+        Tensor of shape ``[B, H * D]``; rows of padding entries stay zero.
+    """
+    B, H, _, D = q.shape
+    output = torch.zeros(B, H * D, dtype=q.dtype, device=q.device)
+
+    # One decay factor per head, computed once up front.
+    decay = torch.exp(-slope_rate).view(-1, 1, 1)  # [H, 1, 1]
+
+    for b in range(B):
+        slot_id = slot_idx[b].item()
+
+        # Skip padding positions
+        if slot_id == -1:
+            continue
+
+        for h in range(H):
+            q_bh = q[b, h, 0]  # [D]
+            k_bh = k[b, h, 0]  # [D]
+            v_bh = v[b, h, 0]  # [D]
+
+            # The cache is addressed by slot, not by batch position; the two
+            # only coincide when slot_idx happens to be arange(B).
+            # NOTE(review): assumes linear_decode_forward_triton addresses
+            # the cache by slot id as well - confirm against the kernel.
+            kv_cache_old = kv_caches[slot_id, h]
+
+            # Decay the old state and add the new key-value outer product.
+            kv_new = torch.outer(k_bh, v_bh) + decay[h, 0, 0] * kv_cache_old
+
+            output[b, h * D : (h + 1) * D] = torch.matmul(q_bh, kv_new)
+            kv_caches[slot_id, h] = kv_new
+
+    return output
+
+
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode()
+def test_linear_decode_forward_triton(
+    batch_size: int,
+    num_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+):
+    """Compare the Triton decode kernel with the PyTorch reference.
+
+    ``slot_idx`` is ``arange(batch_size)``, so batch entry ``i`` uses cache
+    slot ``i``. Both the returned outputs and the updated caches must agree
+    within tolerance.
+    """
+    torch.set_default_device("cuda")
+    torch.manual_seed(42)
+    torch.cuda.manual_seed_all(42)
+    current_platform.seed_everything(42)
+
+    input_scale = 0.01
+    token_shape = (batch_size, num_heads, 1, head_size)
+    # q, k, v and the cache are drawn in this order from the seeded generator.
+    q = input_scale * torch.randn(*token_shape, dtype=dtype)
+    k = input_scale * torch.randn(*token_shape, dtype=dtype)
+    v = input_scale * torch.randn(*token_shape, dtype=dtype)
+    kv_caches = input_scale * torch.randn(
+        batch_size, num_heads, head_size, head_size, dtype=dtype, device="cuda"
+    )
+
+    # The reference updates its cache in place, so give it its own copy.
+    reference_caches = kv_caches.clone()
+
+    # Head h gets slope 0.1 * (h + 1).
+    slope_rate = torch.tensor(
+        [0.1 * (h + 1) for h in range(num_heads)], device="cuda"
+    )
+    slot_idx = torch.arange(batch_size, device="cuda")
+
+    triton_output = linear_decode_forward_triton(
+        q, k, v, kv_caches, slope_rate, slot_idx
+    )
+    reference_output = reference_linear_decode(
+        q, k, v, reference_caches, slope_rate, slot_idx
+    )
+
+    tolerance = {"rtol": 1e-1, "atol": 1e-1}
+    torch.testing.assert_close(triton_output, reference_output, **tolerance)
+    torch.testing.assert_close(kv_caches, reference_caches, **tolerance)
+
+    assert triton_output.shape == (batch_size, num_heads * head_size)
+
+
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode()
+def test_linear_decode_forward_triton_with_padding(
+    num_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+):
+    """Compare kernel and reference when one batch entry is padding.
+
+    The batch has four entries with ``slot_idx = [0, 1, -1, 2]``. Outputs and
+    caches are compared only for entries whose slot index is not ``-1``.
+    """
+    torch.set_default_device("cuda")
+    torch.manual_seed(42)
+    torch.cuda.manual_seed_all(42)
+    current_platform.seed_everything(42)
+
+    batch_size = 4
+    input_scale = 0.01
+    token_shape = (batch_size, num_heads, 1, head_size)
+    # q, k, v and the cache are drawn in this order from the seeded generator.
+    q = input_scale * torch.randn(*token_shape, dtype=dtype)
+    k = input_scale * torch.randn(*token_shape, dtype=dtype)
+    v = input_scale * torch.randn(*token_shape, dtype=dtype)
+    kv_caches = input_scale * torch.randn(
+        batch_size, num_heads, head_size, head_size, dtype=dtype, device="cuda"
+    )
+
+    # The reference updates its cache in place, so give it its own copy.
+    reference_caches = kv_caches.clone()
+
+    # Head h gets slope 0.1 * (h + 1).
+    slope_rate = torch.tensor(
+        [0.1 * (h + 1) for h in range(num_heads)], device="cuda"
+    )
+
+    # Entry 2 is padding (-1).
+    slot_idx = torch.tensor([0, 1, -1, 2], device="cuda")
+
+    triton_output = linear_decode_forward_triton(
+        q, k, v, kv_caches, slope_rate, slot_idx
+    )
+    reference_output = reference_linear_decode(
+        q, k, v, reference_caches, slope_rate, slot_idx
+    )
+
+    atol, rtol = 1.5e-1, 1.5e-1
+    is_valid = slot_idx != -1
+
+    # Caches are compared only at the indices of non-padding entries.
+    for i in range(batch_size):
+        if is_valid[i]:
+            torch.testing.assert_close(
+                kv_caches[i], reference_caches[i], rtol=rtol, atol=atol
+            )
+
+    # Outputs likewise: drop the padding row before comparing.
+    row_mask = is_valid.unsqueeze(1).expand(-1, num_heads * head_size)
+    torch.testing.assert_close(
+        triton_output[row_mask], reference_output[row_mask], rtol=rtol, atol=atol
+    )
+
+    assert triton_output.shape == (batch_size, num_heads * head_size)
+
+
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENGTHS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@torch.inference_mode()
+def test_lightning_attention_reference(
+    batch_size: int,
+    num_heads: int,
+    head_size: int,
+    seq_len: int,
+    dtype: torch.dtype,
+):
+    """Compare ``lightning_attention`` with the sequential PyTorch reference.
+
+    Both start from identical copies of a random KV history and receive the
+    same arguments; outputs and returned KV caches must agree within
+    tolerance.
+    """
+    torch.set_default_device("cuda")
+    torch.manual_seed(42)
+    torch.cuda.manual_seed_all(42)
+    current_platform.seed_everything(42)
+
+    input_scale = 0.01
+    qkv_shape = (batch_size, num_heads, seq_len, head_size)
+    # q, k, v are drawn in this order from the seeded generator.
+    q = input_scale * torch.randn(*qkv_shape, dtype=dtype)
+    k = input_scale * torch.randn(*qkv_shape, dtype=dtype)
+    v = input_scale * torch.randn(*qkv_shape, dtype=dtype)
+
+    # Head h gets decay exponent 0.1 * (h + 1).
+    ed = torch.tensor([0.1 * (h + 1) for h in range(num_heads)], device="cuda")
+
+    kv_history = input_scale * torch.randn(
+        batch_size, num_heads, head_size, head_size, dtype=dtype, device="cuda"
+    )
+    # Separate copy for the implementation under test.
+    kv_history_for_kernel = kv_history.clone()
+
+    block_size = 256
+    ref_output, ref_kv_cache = reference_lightning_attention(
+        q, k, v, ed, block_size, kv_history
+    )
+
+    from vllm.model_executor.layers.lightning_attn import lightning_attention
+
+    actual_output, actual_kv_cache = lightning_attention(
+        q, k, v, ed, block_size, kv_history_for_kernel
+    )
+
+    tolerance = {"rtol": 1.5e-1, "atol": 1.5e-1}
+    torch.testing.assert_close(ref_output, actual_output, **tolerance)
+    torch.testing.assert_close(ref_kv_cache, actual_kv_cache, **tolerance)
+
+    assert ref_output.shape == (batch_size, num_heads, seq_len, head_size)
+    assert ref_kv_cache.shape == actual_kv_cache.shape
--- a/tests/kernels/attention/test_merge_attn_states.py
+++ b/tests/kernels/attention/test_merge_attn_states.py
@@ -0,0 +1,319 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda
+from vllm.attention.ops.triton_merge_attn_states import (
+    merge_attn_states as merge_attn_states_triton,
+)
+from vllm.platforms import current_platform
+
+
+# Naive PyTorch implementation of section 2.2 of
+# https://www.arxiv.org/pdf/2501.01005; it can be used to combine partial
+# attention results (in the split-KV case).
+def merge_attn_states_torch(
+    output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    prefix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    prefix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
+    suffix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    suffix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
+    output_lse: torch.Tensor | None = None,  # [NUM_HEADS, NUM_TOKENS]
+):
+    """Merge two partial attention results using their log-sum-exp values.
+
+    Each partial output is weighted by ``exp(lse - max_lse)``, normalized
+    over both partials. An LSE of ``+inf`` is treated as ``-inf``, so that
+    partial receives weight zero. The caller's LSE tensors are not modified.
+
+    Args:
+        output: Ignored; a freshly computed tensor is returned instead. Kept
+            so the argument list matches the kernels this is compared with.
+        prefix_output: First partial attention output.
+        prefix_lse: Log-sum-exp belonging to ``prefix_output``.
+        suffix_output: Second partial attention output.
+        suffix_lse: Log-sum-exp belonging to ``suffix_output``.
+        output_lse: Only tested for ``None``. When it is not ``None`` the
+            merged LSE is computed and returned; otherwise ``None`` is.
+
+    Returns:
+        ``(merged_output, merged_lse)`` where ``merged_lse`` is ``None`` if
+        ``output_lse`` was ``None``.
+
+    Note:
+        A position where both LSEs are infinite produces NaN, because
+        ``-inf - (-inf)`` is NaN.
+    """
+    # inf -> -inf, on copies so the inputs stay untouched
+    p_lse = prefix_lse.masked_fill(prefix_lse == torch.inf, -torch.inf)
+    s_lse = suffix_lse.masked_fill(suffix_lse == torch.inf, -torch.inf)
+    # max_lse [NUM_HEADS, NUM_TOKENS]
+    max_lse = torch.maximum(p_lse, s_lse)
+    # Subtract the max before exponentiating for numerical stability.
+    p_lse = p_lse - max_lse
+    s_lse = s_lse - max_lse
+    p_lse_exp = torch.exp(p_lse)
+    s_lse_exp = torch.exp(s_lse)
+    out_se = p_lse_exp + s_lse_exp
+    if output_lse is not None:
+        output_lse = torch.log(out_se) + max_lse
+    p_scale = p_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
+    s_scale = s_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
+    p_scale = torch.transpose(p_scale, 0, 1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
+    s_scale = torch.transpose(s_scale, 0, 1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
+    output = prefix_output * p_scale + suffix_output * s_scale
+    return output, output_lse
+
+
+# Parameter grids for the @pytest.mark.parametrize decorators of
+# test_merge_attn_states.
+NUM_BATCH_TOKENS = [256, 512, 613, 1024, 1536, 4096]
+NUM_QUERY_HEADS = [4, 8, 16, 32, 48, 64]
+HEAD_SIZES = [32, 48, 64, 96, 128, 256]
+DTYPES = [torch.float32, torch.half, torch.bfloat16]
+
+# One tuple per finished test case; test_merge_attn_states appends to it and
+# generate_markdown_table prints it.
+all_case_info: list[tuple] = []
+
+
+def generate_markdown_table():
+    """Print every record in ``all_case_info`` as a Markdown table.
+
+    The header and separator rows are always printed. Each record is a
+    9-tuple of (tokens, heads, head size, dtype, device name, torch ms,
+    triton ms, cuda ms, speedup). The ``torch.`` prefix is dropped from the
+    dtype and a leading ``NVIDIA`` from the device name.
+    """
+    print(
+        "| tokens | heads | headsize | dtype "
+        "| device | torch | triton | cuda | speedup |"
+    )
+    print("| --- | --- | --- | --- | --- | --- | --- | --- | --- |")
+
+    for (
+        num_tokens,
+        num_heads,
+        head_size,
+        dtype,
+        device,
+        torch_ms,
+        triton_ms,
+        cuda_ms,
+        speedup,
+    ) in all_case_info:
+        dtype_name = str(dtype).removeprefix("torch.")
+        device_name = device.removeprefix("NVIDIA").strip()
+        print(
+            f"| {num_tokens} | {num_heads} | {head_size} "
+            f"| {dtype_name} | {device_name} | {torch_ms:.5f}ms "
+            f"| {triton_ms:.5f}ms "
+            f"| {cuda_ms:.5f}ms "
+            f"| {speedup:.4f}x |"
+        )
+
+
+def _time_cuda_calls(run_once, warmup_times: int, repeat_times: int):
+    """Warm up ``run_once``, then time it; return ``(avg_ms, last_result)``.
+
+    Each timed call is bracketed by a pair of timing events and followed by
+    ``torch.cuda.synchronize()`` before the elapsed time is read.
+    """
+    start = torch.Event(enable_timing=True)
+    end = torch.Event(enable_timing=True)
+
+    result = None
+    for _ in range(warmup_times):
+        result = run_once()
+    torch.cuda.synchronize()
+
+    total_ms = 0
+    for _ in range(repeat_times):
+        start.record()
+        result = run_once()
+        end.record()
+        torch.cuda.synchronize()
+        total_ms += start.elapsed_time(end)
+
+    return total_ms / repeat_times, result
+
+
+@pytest.mark.parametrize("num_tokens", NUM_BATCH_TOKENS)
+@pytest.mark.parametrize("num_query_heads", NUM_QUERY_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("output_dtype", DTYPES)
+@torch.inference_mode()
+def test_merge_attn_states(
+    num_tokens: int, num_query_heads: int, head_size: int, output_dtype: torch.dtype
+):
+    """Check the CUDA merge_attn_states kernel against the Triton kernel.
+
+    The naive PyTorch version is run and timed as well, but only its maximum
+    absolute differences are printed; the assertions compare CUDA to Triton.
+    The timings of all three are printed and appended to ``all_case_info``;
+    once every parametrized case has been recorded, the summary table is
+    printed.
+    """
+    if not current_platform.is_cuda():
+        pytest.skip(
+            "Currently only support compare triton merge_attn_states "
+            "with custom cuda merge_attn_states kernel"
+        )
+
+    NUM_TOKENS = num_tokens
+    NUM_HEADS = num_query_heads
+    HEAD_SIZE = head_size
+
+    print(
+        f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, "
+        f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, "
+        f"Device: {current_platform.get_device_name()}"
+    )
+
+    # prefix_lse and suffix_lse contain inf and normal values
+    prefix_lse = torch.randn(NUM_HEADS, NUM_TOKENS, dtype=torch.float32, device="cuda")
+    suffix_lse = torch.randn(NUM_HEADS, NUM_TOKENS, dtype=torch.float32, device="cuda")
+
+    # Generate boolean masks
+    mask_prefix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1
+    mask_suffix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1
+    # Ensure that the same position is not True at the same time
+    combined_mask = torch.logical_and(mask_prefix, mask_suffix)
+    mask_prefix = torch.logical_and(mask_prefix, ~combined_mask)
+    mask_suffix = torch.logical_and(mask_suffix, ~combined_mask)
+
+    prefix_lse[mask_prefix] = float("inf")
+    suffix_lse[mask_suffix] = float("inf")
+
+    # Other input tensors (need to be initialized but
+    # no actual calculation needed)
+    output = torch.zeros(
+        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda"
+    )
+    output_lse = torch.zeros(
+        (NUM_HEADS, NUM_TOKENS), dtype=torch.float32, device="cuda"
+    )
+    prefix_output = torch.randn(
+        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda"
+    )
+    suffix_output = torch.randn(
+        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda"
+    )
+
+    warmup_times = 2
+    repeat_times = 20
+
+    # 0. Run the Torch kernel
+    # It gets its own copies of the LSE tensors, so nothing it does to them
+    # can leak into the inputs of the Triton and CUDA kernels.
+    torch_output_buf = output.clone()
+    torch_output_lse_buf = output_lse.clone()
+    prefix_lse_torch = prefix_lse.clone()
+    suffix_lse_torch = suffix_lse.clone()
+    avg_time_torch_kernel, (output_torch, output_lse_torch) = _time_cuda_calls(
+        lambda: merge_attn_states_torch(
+            torch_output_buf,
+            prefix_output,
+            prefix_lse_torch,
+            suffix_output,
+            suffix_lse_torch,
+            torch_output_lse_buf,
+        ),
+        warmup_times,
+        repeat_times,
+    )
+
+    # 1. Run the Triton kernel
+    output_ref_triton = output.clone()
+    output_lse_ref_triton = output_lse.clone()
+    avg_time_triton_kernel, _ = _time_cuda_calls(
+        lambda: merge_attn_states_triton(
+            output_ref_triton,
+            prefix_output,
+            prefix_lse,
+            suffix_output,
+            suffix_lse,
+            output_lse_ref_triton,
+        ),
+        warmup_times,
+        repeat_times,
+    )
+
+    # 2. Run the CUDA kernel
+    output_cuda = output.clone()
+    output_lse_cuda = output_lse.clone()
+    avg_time_cuda_kernel, _ = _time_cuda_calls(
+        lambda: merge_attn_states_cuda(
+            output_cuda,
+            prefix_output,
+            prefix_lse,
+            suffix_output,
+            suffix_lse,
+            output_lse_cuda,
+        ),
+        warmup_times,
+        repeat_times,
+    )
+
+    # 3. Performance compare
+    performance_improved = avg_time_triton_kernel / avg_time_cuda_kernel
+    print(f" Torch time: {avg_time_torch_kernel:.6f}ms")
+    print(f"Triton time: {avg_time_triton_kernel:.6f}ms")
+    print(
+        f"  CUDA time: {avg_time_cuda_kernel:.6f}ms, "
+        f"Performance: {performance_improved:.5f}x"
+    )
+    print("-" * 100)
+
+    # 4. Correctness compare
+    # Liger Kernel: Efficient Triton Kernels for LLM Training
+    # https://arxiv.org/pdf/2410.10989, 3.3 Correctness
+    # use rtol = 1e-2 for bfloat16.
+    rtol = 1e-2 if output_dtype == torch.bfloat16 else 1e-3
+
+    def diff(a: torch.Tensor, b: torch.Tensor):
+        max_diff = torch.max(torch.abs(a.float() - b.float()))
+        return max_diff
+
+    # Use Triton output as reference because we want to replace
+    # the Triton kernel with custom CUDA kernel for merge attn
+    # states operation.
+    output_ref = output_ref_triton
+    output_lse_ref = output_lse_ref_triton
+    torch.testing.assert_close(
+        output_cuda.float(), output_ref.float(), atol=1e-3, rtol=rtol
+    )
+    print("Output all match, max abs diff:")
+    print(f"(Triton vs Torch) : {diff(output_torch, output_ref)}")
+    print(f"  (CUDA vs Torch) : {diff(output_torch, output_cuda)}")
+    print(f"  (CUDA vs Triton): {diff(output_ref, output_cuda)}")
+    print("-" * 100)
+
+    torch.testing.assert_close(
+        output_lse_cuda.float(), output_lse_ref.float(), atol=1e-3, rtol=rtol
+    )
+    print("Output LSE all match, max abs diff:")
+    print(f"(Triton vs Torch) : {diff(output_lse_torch, output_lse_ref)}")
+    print(f"  (CUDA vs Torch) : {diff(output_lse_torch, output_lse_cuda)}")
+    print(f"  (CUDA vs Triton): {diff(output_lse_ref, output_lse_cuda)}")
+    print("-" * 100)
+
+    print(
+        "All output values test passed! All inf values "
+        "are correctly replaced with -inf."
+    )
+    print("-" * 100)
+
+    device = current_platform.get_device_name()
+    all_case_info.append(
+        (
+            NUM_TOKENS,
+            NUM_HEADS,
+            HEAD_SIZE,
+            output_dtype,
+            device,
+            avg_time_torch_kernel,
+            avg_time_triton_kernel,
+            avg_time_cuda_kernel,
+            performance_improved,
+        )
+    )
+    # Print the summary table once every parametrized case has been recorded.
+    if len(all_case_info) == (
+        len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) * len(NUM_QUERY_HEADS) * len(DTYPES)
+    ):
+        generate_markdown_table()
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -0,0 +1,153 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test:
+
+* Tests for MultiHeadAttention layer
+"""
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.selector import _cached_get_attn_backend
+from vllm.platforms import current_platform
+from vllm.platforms.cpu import CpuPlatform
+from vllm.platforms.cuda import CudaPlatform
+from vllm.platforms.rocm import RocmPlatform
+
+
+@pytest.fixture(autouse=True)
+def clear_cache():
+    """Clear the cached attention-backend lookup before each test.
+
+    The fixture is ``autouse``, so it runs for every test in this module and
+    calls ``cache_clear()`` on ``_cached_get_attn_backend``; no test then
+    sees a result cached by an earlier one.
+    """
+    _cached_get_attn_backend.cache_clear()
+
+
+# Devices test_mha_attn_platform is parametrized over: always "cpu", plus
+# "cuda" and/or "hip" when the current platform reports CUDA or ROCm.
+devices = ["cpu"]
+if current_platform.is_cuda():
+    devices.append("cuda")
+if current_platform.is_rocm():
+    devices.append("hip")
+
+
+@pytest.mark.parametrize("device", devices)
+def test_mha_attn_platform(device: str):
+    """
+    Check which attention backend MultiHeadAttention picks per platform.
+
+    ``current_platform`` is patched in both modules listed below with a
+    fresh instance of the platform class for ``device``, and the layer's
+    ``attn_backend`` is compared with the expected enum member.
+    """
+    torch.set_default_dtype(torch.float16)
+
+    # Pick the platform class and the (head_size, expected backend) cases.
+    if device == "cpu":
+        platform_cls = CpuPlatform
+        cases = [(64, AttentionBackendEnum.TORCH_SDPA)]
+    elif device == "hip":
+        platform_cls = RocmPlatform
+        cases = [(64, AttentionBackendEnum.FLASH_ATTN)]
+    else:
+        platform_cls = CudaPlatform
+        cases = [
+            # head_size=64 (divisible by 32)
+            # - should use vLLM's FlashAttention
+            (64, AttentionBackendEnum.FLASH_ATTN),
+            # head_size=72 (not divisible by 32)
+            # - should use vLLM's FlashAttention
+            (72, AttentionBackendEnum.FLASH_ATTN),
+        ]
+
+    for head_size, expected_backend in cases:
+        with (
+            patch("vllm.attention.layer.current_platform", platform_cls()),
+            patch("vllm.model_executor.models.vision.current_platform", platform_cls()),
+        ):
+            attn = MultiHeadAttention(16, head_size, scale=1)
+            assert attn.attn_backend == expected_backend
+
+
+def ref_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    scale: float,
+) -> torch.Tensor:
+    """
+    Plain scaled dot-product attention without any mask.
+
+    - query, key, value: [batch_size, seq_len, num_heads, head_size]
+    - returns: [batch_size, seq_len, num_heads, head_size]
+    """
+    # Move heads in front of the sequence axis: [batch, heads, seq, head_size].
+    q_heads = query.transpose(1, 2)
+    k_heads = key.transpose(1, 2)
+    v_heads = value.transpose(1, 2)
+
+    scores = scale * torch.matmul(q_heads, k_heads.transpose(2, 3))
+    probs = torch.softmax(scores, dim=-1).to(v_heads.dtype)
+
+    # Back to [batch, seq, heads, head_size].
+    return torch.matmul(probs, v_heads).transpose(1, 2)
+
+
+# Parameter grids for the @pytest.mark.parametrize decorators of
+# test_mha_attn_forward.
+BATCH_SIZES = [1, 16]
+SEQ_LENS = [1]
+NUM_HEADS = [1, 16]
+NUM_KV_HEADS = [1]
+HEAD_SIZES = [64, 80]
+# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
+# torch.float is therefore only included when not running on ROCm.
+DTYPES = (
+    [torch.half, torch.bfloat16, torch.float]
+    if not current_platform.is_rocm()
+    else [torch.half, torch.bfloat16]
+)
+CUDA_DEVICES = ["cuda"]
+
+
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_mha_attn_forward(
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    dtype: torch.dtype,
+    device: str,
+):
+    """Compare MultiHeadAttention's forward pass with ``ref_attention``.
+
+    Inputs are random tensors with heads flattened into the last dimension.
+    For the reference they are reshaped to one axis per head, and K/V heads
+    are repeated when there are fewer KV heads than query heads.
+    """
+    current_platform.seed_everything(0)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+
+    q_width = num_heads * head_size
+    kv_width = num_kv_heads * head_size
+    # q, k, v are drawn in this order from the seeded generator.
+    q = torch.randn(batch_size, seq_len, q_width)
+    k = torch.randn(batch_size, seq_len, kv_width)
+    v = torch.randn(batch_size, seq_len, kv_width)
+    scale = 1.0 / head_size**0.5
+
+    layer = MultiHeadAttention(
+        num_heads, head_size, scale=scale, num_kv_heads=num_kv_heads
+    )
+    output = layer(q, k, v)
+
+    assert num_heads % num_kv_heads == 0
+    group_size = num_heads // num_kv_heads
+
+    # Split the flattened last dimension into (heads, head_size).
+    q_heads = q.reshape(batch_size, seq_len, num_heads, head_size)
+    k_heads, v_heads = (
+        t.reshape(batch_size, seq_len, num_kv_heads, head_size) for t in (k, v)
+    )
+    if group_size > 1:
+        # Repeat each KV head so every query head has a matching K/V head.
+        k_heads, v_heads = (
+            torch.repeat_interleave(t, group_size, dim=2) for t in (k_heads, v_heads)
+        )
+
+    expected = ref_attention(q_heads, k_heads, v_heads, scale=scale)
+    expected = expected.reshape(batch_size, seq_len, q_width)
+    torch.testing.assert_close(output, expected)
--- a/tests/kernels/attention/test_mla_decode_cpu.py
+++ b/tests/kernels/attention/test_mla_decode_cpu.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+import vllm._custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.utils.math_utils import cdiv
+
+
+def ref_mla(
+    out: Tensor,  # (bs, num_heads, v_head_dim)
+    query: Tensor,  # (bs, num_heads, head_dim)
+    kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
+    scale: float,
+    block_tables: Tensor,  # (bs, max_num_blocks)
+    seq_lens: Tensor,  # (bs,)
+):
+    """Reference MLA decode using ``F.scaled_dot_product_attention``.
+
+    For each sequence, the blocks named in its block-table row are gathered
+    and flattened into one token axis, then cut to the sequence length. The
+    gathered rows serve as keys, and their first ``v_head_dim`` features as
+    values. The result is written into ``out[i]``; ``out`` is returned.
+    """
+    bs, num_heads, v_head_dim = out.shape
+    head_dim = query.shape[2]
+
+    for i in range(bs):
+        # Gather this sequence's blocks and flatten them to a token axis.
+        gathered = kv_cache[block_tables[i]].view(1, -1, head_dim)
+        keys = gathered[:, : seq_lens[i]]  # (1, seq_len, head_dim)
+        values = keys[:, :, :v_head_dim]  # (1, seq_len, v_head_dim)
+
+        queries = query[i].view(num_heads, 1, head_dim)
+        attn = F.scaled_dot_product_attention(
+            queries, keys, values, scale=scale, enable_gqa=True
+        )
+        out[i] = attn.view(num_heads, v_head_dim)
+
+    return out
+
+
+@pytest.mark.parametrize("bs", [4])
+@pytest.mark.parametrize("mean_seq_len", [256])
+@pytest.mark.parametrize("h_q", [16])
+@pytest.mark.parametrize("d", [576])
+@pytest.mark.parametrize("dv", [512])
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("dtype", [torch.float, torch.half, torch.bfloat16])
+@pytest.mark.parametrize("varlen", [False, True])
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
+def test_mla_decode_cpu(
+    bs: int,
+    mean_seq_len: int,
+    h_q: int,
+    d: int,
+    dv: int,
+    block_size: int,
+    dtype: torch.dtype,
+    varlen: bool,
+):
+    """Compare ``ops.mla_decode_kvcache_cpu`` with the ``ref_mla`` reference.
+
+    Cache positions beyond each sequence's length are filled with NaN, so a
+    NaN in the op's output indicates it read past the sequence length.
+    """
+    torch.set_default_dtype(dtype)
+    torch.manual_seed(0)
+
+    scale = d ** (-0.5)
+    if not varlen:
+        seq_lens = torch.full((bs,), mean_seq_len, dtype=torch.int32)
+    else:
+        # Normally distributed lengths, clipped to at least 2.
+        sampled = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
+        seq_lens = sampled.clip(2).to(torch.int32)
+    max_seq_len = seq_lens.max().item()
+    seqlen_pad = cdiv(max_seq_len, 256) * 256  # is this necessary?
+
+    q = torch.randn(bs, h_q, d)
+    # Each sequence gets its own consecutive run of block ids.
+    block_table = torch.arange(
+        bs * seqlen_pad // block_size, dtype=torch.int32
+    ).view(bs, seqlen_pad // block_size)
+
+    kv_cache = torch.randn(block_table.numel(), block_size, d)
+    # Poison everything past each sequence's length.
+    per_seq_tokens = kv_cache.view(bs, seqlen_pad, d)
+    for i, seq_len in enumerate(seq_lens.tolist()):
+        per_seq_tokens[i, seq_len:] = float("nan")
+
+    out_mla = q.new_zeros(bs, h_q, dv)
+    ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table, seq_lens)
+
+    out_ref = q.new_zeros(bs, h_q, dv)
+    ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)
+
+    assert not out_mla.isnan().any(), "Likely read out of bounds"
+    torch.testing.assert_close(out_mla, out_ref)
--- a/tests/kernels/attention/test_pack_unpack_triton.py
+++ b/tests/kernels/attention/test_pack_unpack_triton.py
@@ -0,0 +1,234 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+from torch.testing import assert_close
+
+from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
+
+
+def test_pack_seq_basic_fp8():
+    """Pack 3D fp8 tensors and check shape, dtype, device and valid payload."""
+    cuda = "cuda"
+    fp8 = torch.float8_e4m3fn
+
+    # Each case is (N, H, D, B, per-sequence lengths): the input is (N, H, D)
+    # and the packed result is expected to be (B, max(lengths), H, D).
+    cases = (
+        (6, 8, 4, 2, [3, 3]),
+        (10, 4, 8, 3, [2, 4, 4]),
+        (20, 16, 32, 4, [5, 5, 5, 5]),
+    )
+
+    for n_tokens, n_heads, head_dim, batch, seq_lens in cases:
+        # Keep magnitudes small so the values stay well inside the fp8 range.
+        src = torch.randn(
+            n_tokens, n_heads, head_dim, dtype=torch.float32, device=cuda
+        )
+        src = (src * 0.1).to(dtype=fp8)
+        lengths = torch.tensor(seq_lens, device=cuda)
+
+        packed = pack_seq_triton(src, lengths)
+
+        assert packed.shape == (batch, max(seq_lens), n_heads, head_dim)
+        assert packed.dtype == fp8
+        assert packed.device == src.device
+
+        # Walk the sequences with a running offset into the flat input and
+        # compare the valid rows (tolerances allow for fp8 precision).
+        offset = 0
+        for row in range(batch):
+            seq_len = seq_lens[row]
+            want = src[offset : offset + seq_len].to(torch.float32)
+            got = packed[row, :seq_len].to(torch.float32)
+            assert_close(got, want, rtol=1e-1, atol=1e-2)
+            offset += seq_len
+
+
+def test_pack_seq_custom_padding_fp8():
+    """Run pack_seq_triton with several explicit ``pad_value`` settings (fp8)."""
+    cuda = "cuda"
+    fp8 = torch.float8_e4m3fn
+    n_tokens, n_heads, head_dim, batch = 20, 8, 16, 2
+    seq_len = 10
+    lengths = torch.tensor([10, 10], device=cuda)
+
+    src = torch.randn(n_tokens, n_heads, head_dim, dtype=torch.float32, device=cuda)
+    src = (src * 0.1).to(dtype=fp8)
+
+    for pad in (-100.0, -10.0, 0.0, 10.0, 100.0):
+        packed = pack_seq_triton(src, lengths, pad_value=pad)
+
+        # The valid rows must come through unchanged (within fp8 precision).
+        for row in range(batch):
+            lo = row * seq_len
+            want = src[lo : lo + seq_len].to(torch.float32)
+            got = packed[row, :seq_len].to(torch.float32)
+            assert_close(got, want, rtol=1e-1, atol=1e-2)
+
+        # Everything past the valid positions should carry the padding value;
+        # only its rough magnitude/sign is checked because of fp8 rounding.
+        # NOTE(review): both sequences have length 10, so if the packed length
+        # equals max(lengths) this slice is empty and the checks below pass
+        # vacuously -- confirm and consider using unequal lengths.
+        tail = packed[:, seq_len:].to(torch.float32)
+        if pad > 0:
+            assert torch.all(tail > 50)  # Large positive values
+        elif pad < 0:
+            assert torch.all(tail < -50)  # Large negative values
+        else:
+            assert torch.allclose(tail, torch.zeros_like(tail), atol=1e-2)
+
+
+def test_pack_seq_default_negative_inf_padding_fp8():
+    """Check the padding written when no ``pad_value`` is passed (fp8 input)."""
+    cuda = "cuda"
+    fp8 = torch.float8_e4m3fn
+    # Two sequences of 10 tokens each, i.e. a batch of 2.
+    n_tokens, n_heads, head_dim = 20, 8, 16
+    lengths = torch.tensor([10, 10], device=cuda)
+
+    src = torch.randn(n_tokens, n_heads, head_dim, dtype=torch.float32, device=cuda)
+    src = (src * 0.1).to(dtype=fp8)
+    packed = pack_seq_triton(src, lengths)
+
+    # The default padding is expected to show up as a large negative number
+    # once converted back to float32.
+    # NOTE(review): both sequences have length 10, so if the packed length
+    # equals max(lengths) this slice is empty and the assertion is vacuous --
+    # confirm and consider using unequal lengths.
+    tail = packed[:, 10:].to(torch.float32)
+    all_strongly_negative = torch.all(tail < -100)
+    assert all_strongly_negative
+
+
+def test_pack_seq_edge_cases_fp8():
+    """Check the packed output shape for a few boundary-style inputs (fp8)."""
+    cuda = "cuda"
+    fp8 = torch.float8_e4m3fn
+
+    # (input shape, sequence lengths, expected packed shape)
+    scenarios = [
+        # Single batch element.
+        ((10, 8, 16), [10], (1, 10, 8, 16)),
+        # Very short sequences (the lengths cover only 3 of the 20 rows).
+        ((20, 4, 8), [1, 1, 1], (3, 1, 4, 8)),
+        # Sequences of differing length.
+        ((15, 8, 16), [5, 7, 3], (3, 7, 8, 16)),
+    ]
+
+    for in_shape, seq_lens, out_shape in scenarios:
+        src = torch.randn(*in_shape, dtype=torch.float32, device=cuda) * 0.1
+        src = src.to(dtype=fp8)
+        lengths = torch.tensor(seq_lens, device=cuda)
+        packed = pack_seq_triton(src, lengths)
+        assert packed.shape == out_shape
+
+
+def test_pack_seq_different_block_sizes_fp8():
+    """Run pack_seq_triton with several ``block_t``/``block_d`` settings (fp8)."""
+    cuda = "cuda"
+    fp8 = torch.float8_e4m3fn
+    n_tokens, n_heads, head_dim, batch = 100, 16, 32, 4
+    seq_len = 25
+    lengths = torch.tensor([25, 25, 25, 25], device=cuda)
+
+    src = torch.randn(n_tokens, n_heads, head_dim, dtype=torch.float32, device=cuda)
+    src = (src * 0.1).to(dtype=fp8)
+
+    # Only square tile configurations are exercised: block_t == block_d.
+    for tile in (32, 64, 128):
+        packed = pack_seq_triton(src, lengths, block_t=tile, block_d=tile)
+
+        assert packed.shape == (batch, seq_len, n_heads, head_dim)
+
+        # The payload must not depend on the tile size (within fp8 precision).
+        for row in range(batch):
+            lo = row * seq_len
+            hi = lo + seq_len
+            got = packed[row, :seq_len].to(torch.float32)
+            want = src[lo:hi].to(torch.float32)
+            assert_close(got, want, rtol=1e-1, atol=1e-2)
+
+
+def test_pack_seq_shape_consistency():
+    """Check that the packed shape is (batch, longest sequence, *features)."""
+    cuda = "cuda"
+    fp8 = torch.float8_e4m3fn
+    n_tokens, n_heads, head_dim, batch = 20, 8, 16, 2
+    lengths = torch.tensor([10, 10], device=cuda)
+
+    src = torch.randn(n_tokens, n_heads, head_dim, dtype=torch.float32, device=cuda)
+    src = (src * 0.1).to(dtype=fp8)
+
+    packed_shape = pack_seq_triton(src, lengths).shape
+
+    # Leading dim is the batch, then the longest sequence, then the input's
+    # trailing (per-token) dimensions unchanged.
+    assert packed_shape[0] == batch
+    assert packed_shape[1] == lengths.max().item()
+    assert packed_shape[2:] == src.shape[1:]
+
+
+def test_pack_unpack_roundtrip_fp8():
+    """Pack then unpack fp8 tensors and expect the original values back."""
+    cuda = "cuda"
+    fp8 = torch.float8_e4m3fn
+
+    # (N, H, D, B, per-sequence lengths); B is not needed by this test.
+    cases = (
+        (6, 8, 4, 2, [3, 3]),
+        (10, 4, 8, 3, [2, 4, 4]),
+        (20, 16, 32, 4, [5, 5, 5, 5]),
+        (15, 8, 16, 3, [7, 5, 3]),
+    )
+
+    for n_tokens, n_heads, head_dim, _batch, seq_lens in cases:
+        # Keep magnitudes small so the values stay well inside the fp8 range.
+        src = torch.randn(
+            n_tokens, n_heads, head_dim, dtype=torch.float32, device=cuda
+        )
+        src = (src * 0.1).to(dtype=fp8)
+        lengths = torch.tensor(seq_lens, device=cuda)
+
+        packed = pack_seq_triton(src, lengths)
+        restored = unpack_seq_triton(packed, lengths)
+
+        # The round trip must reproduce the flat input.
+        assert restored.shape == src.shape
+        src_f32 = src.to(torch.float32)
+        assert_close(src_f32, restored.to(torch.float32), rtol=1e-3, atol=1e-3)
+
+        # Second unpack with the same arguments as the first (neither call
+        # passes explicit start locations).
+        restored_again = unpack_seq_triton(packed, lengths)
+        assert_close(src_f32, restored_again.to(torch.float32), rtol=1e-3, atol=1e-2)
+
+
+def test_unpack_seq_triton_edge_cases_fp8():
+    """Round-trip a few boundary-style inputs through pack/unpack (fp8)."""
+    cuda = "cuda"
+    fp8 = torch.float8_e4m3fn
+
+    def roundtrip(shape, seq_lens):
+        """Return (fp8 input, unpack(pack(input))) for the given shape/lengths."""
+        src = torch.randn(*shape, dtype=torch.float32, device=cuda) * 0.1
+        src = src.to(dtype=fp8)
+        lengths = torch.tensor(seq_lens, device=cuda)
+        packed = pack_seq_triton(src, lengths)
+        return src, unpack_seq_triton(packed, lengths)
+
+    # Single batch element.
+    src, restored = roundtrip((10, 8, 16), [10])
+    assert restored.shape == src.shape
+    assert_close(
+        src.to(torch.float32), restored.to(torch.float32), rtol=1e-1, atol=1e-2
+    )
+
+    # Very short sequences: the lengths cover only 3 of the 20 input rows, so
+    # the result is compared against the first 3 rows of the input.
+    src, restored = roundtrip((20, 4, 8), [1, 1, 1])
+    assert_close(
+        src[:3].to(torch.float32), restored.to(torch.float32), rtol=1e-1, atol=1e-2
+    )
+
+    # Sequences of differing length.
+    src, restored = roundtrip((15, 8, 16), [5, 7, 3])
+    assert restored.shape == src.shape
+    assert_close(
+        src.to(torch.float32), restored.to(torch.float32), rtol=1e-1, atol=1e-2
+    )
--- a/tests/kernels/attention/test_prefix_prefill.py
+++ b/tests/kernels/attention/test_prefix_prefill.py
@@ -0,0 +1,639 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import math
+import random
+import time
+from collections.abc import Callable
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
+from vllm.attention.ops.prefix_prefill import context_attention_fwd
+from vllm.platforms import current_platform
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
+# Parametrization grids for the tests below.
+NUM_HEADS = [64]
+# Number of query heads per KV head; the tests derive
+# num_kv_heads = num_heads // num_queries_per_kv.
+NUM_QUERIES_PER_KV = [1, 64]
+HEAD_SIZES = [24, 128]
+DTYPES = [torch.float16]
+# One entry when exactly one CUDA device is reported, otherwise two
+# ("cuda:0" and "cuda:1").
+CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+# The reference mask only applies a window when the value is > 0.
+SLIDING_WINDOW = [0, 16, 2048]
+KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"]
+
+# Attention implementations under test; the tests call each one with the same
+# argument list.
+OPS = [chunked_prefill_paged_decode, context_attention_fwd]
+
+
+def create_causal_attention_mask_for_sdpa(
+    query_lens: list[int],
+    seq_lens: list[int],
+    sliding_window: int = 0,
+    device: torch.device | str | None = None,
+    dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    """Build an additive block-diagonal causal mask for a packed batch.
+
+    Queries and keys of all sequences are laid out back to back along the two
+    axes. An entry is ``0.0`` where the query may attend to the key and
+    ``-inf`` everywhere else, including across different sequences. Within a
+    sequence the queries are treated as its last ``query_len`` positions.
+
+    Args:
+        query_lens: Number of query tokens per sequence.
+        seq_lens: Total number of key tokens per sequence.
+        sliding_window: When > 0, a query only sees the ``sliding_window``
+            most recent keys up to and including its own position.
+        device: Device for the mask (torch default when ``None``).
+        dtype: Dtype for the mask (torch default when ``None``).
+
+    Returns:
+        Tensor of shape ``(sum(query_lens), sum(seq_lens))``.
+
+    Raises:
+        ValueError: If ``query_lens`` and ``seq_lens`` differ in length.
+    """
+    # zip() would silently drop the unmatched tail and yield a mask whose
+    # trailing rows/columns are fully masked; fail loudly instead.
+    if len(query_lens) != len(seq_lens):
+        raise ValueError(
+            "query_lens and seq_lens must have the same length, "
+            f"got {len(query_lens)} and {len(seq_lens)}"
+        )
+
+    total_queries = sum(query_lens)
+    total_keys = sum(seq_lens)
+
+    # Start fully masked; only the per-sequence diagonal blocks are opened up.
+    mask = torch.full(
+        (total_queries, total_keys), float("-inf"), device=device, dtype=dtype
+    )
+
+    query_start = 0
+    key_start = 0
+
+    for query_len, seq_len in zip(query_lens, seq_lens):
+        query_end = query_start + query_len
+        key_end = key_start + seq_len
+        q_indices = torch.arange(query_len, device=device)
+        k_indices = torch.arange(seq_len, device=device)
+        # Position of each query inside its own sequence.
+        q_pos_in_seq = seq_len - query_len + q_indices
+
+        # Causal: a query sees keys at or before its own position.
+        valid_mask = k_indices[None, :] <= q_pos_in_seq[:, None]
+
+        if sliding_window > 0:
+            valid_mask &= k_indices[None, :] >= (
+                q_pos_in_seq[:, None] - sliding_window + 1
+            )
+
+        # Basic slicing returns a view, so this write lands in `mask` itself.
+        mask[query_start:query_end, key_start:key_end][valid_mask] = 0.0
+
+        query_start = query_end
+        key_start = key_end
+
+    return mask
+
+
+def create_alibi_causal_mask(
+    query_len: int,
+    seq_len: int,
+    alibi_slopes: torch.Tensor,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Return the additive ALiBi + causal bias for a single sequence.
+
+    The queries are treated as the last ``query_len`` positions of a sequence
+    of ``seq_len`` keys. Each visible entry is ``slope * (key_pos - query_pos)``
+    cast to ``dtype``; entries for keys after the query are ``-inf``.
+
+    Returns:
+        Tensor of shape ``(1, len(alibi_slopes), query_len, seq_len)``.
+    """
+    first_query = seq_len - query_len
+    key_cols = torch.arange(seq_len, device=device, dtype=torch.float32)[None, :]
+    query_rows = torch.arange(
+        first_query, seq_len, device=device, dtype=torch.float32
+    )[:, None]
+
+    # distance[i, j] = key position j minus query position i.
+    distance = key_cols - query_rows
+
+    # One bias plane per head: [num_heads, query_len, seq_len].
+    bias = (alibi_slopes[:, None, None] * distance[None, :, :]).to(dtype)
+
+    # Keys that lie after the query position must not be attended to.
+    is_future = key_cols > query_rows
+    bias = bias.masked_fill(is_future[None, :, :], float("-inf"))
+
+    # SDPA expects a leading batch dimension even for a single sequence.
+    return bias[None]
+
+
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW)
+@pytest.mark.parametrize("op", OPS)
+@torch.inference_mode()
+def test_contexted_kv_attention(
+    num_heads: int,
+    num_queries_per_kv: int,
+    head_size: int,
+    sliding_window: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    device: str,
+    op: Callable,
+) -> None:
+    """Compare a prefix-prefill attention op with a PyTorch SDPA reference.
+
+    A batch of 10 sequences with random query/context lengths is generated.
+    The context tokens are written into a paged KV cache, the new tokens are
+    passed to ``op`` directly, and the result is compared against
+    ``F.scaled_dot_product_attention`` run over the packed batch with a
+    block-diagonal causal (optionally sliding-window) mask. Timings are
+    printed but not asserted.
+    """
+    if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89):
+        pytest.skip(
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+
+    if (
+        current_platform.is_rocm()
+        and op is chunked_prefill_paged_decode
+        and kv_cache_dtype == "fp8_e5m2"
+    ):
+        pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
+
+    current_platform.seed_everything(0)
+    torch.set_default_device(device)
+
+    # Need this, otherwise when we capture the graph the process
+    # for GPU 1 would run on both GPU0 and GPU1 and things would hang
+    #
+    # see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523
+    torch.cuda.set_device(device)
+
+    MAX_SEQ_LEN = 1024
+    MAX_CTX_LEN = 1024
+    BS = 10
+    cache_size = 640
+    block_size = 32
+    max_block_per_request = 64
+    query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
+    # ensure one sequence in batch is a decode
+    query_lens[-1] = 1
+
+    ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
+    # Full length of each sequence = new (query) tokens + cached context.
+    seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
+    num_kv_heads = num_heads // num_queries_per_kv
+
+    num_tokens = sum(query_lens)
+    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
+    query.uniform_(-1e-3, 1e-3)
+    output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
+
+    # Keys and values for every token of every sequence, stored back to back.
+    kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
+    kv.uniform_(-1e-3, 1e-3)
+    key, value = kv.unbind(dim=1)
+
+    if kv_cache_dtype == "auto":
+        cache_dtype = dtype
+    else:
+        cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
+    k_cache = torch.zeros(
+        cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype
+    )
+    v_cache = torch.zeros(
+        cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype
+    )
+    # k / v hold only the new (query) tokens; they are passed to the op as-is.
+    k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
+    v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
+    # Shuffle the block ids and deal them out row by row. Since
+    # BS * max_block_per_request == cache_size (10 * 64 == 640), every block
+    # id appears exactly once in the table.
+    values = torch.arange(0, cache_size, dtype=torch.int32)
+    values = values[torch.randperm(cache_size)]
+    block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
+    b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
+    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
+    # Cumulative query offsets (BS + 1 entries) into the flat query/k/v tensors.
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
+    max_input_len = MAX_SEQ_LEN
+    # copy kv to cache
+    # b_seq_start_loc[i] is where sequence i starts inside the flat key/value.
+    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to(
+        torch.int32
+    )
+    for i in range(BS):
+        # The tokens after the context of sequence i are its new tokens.
+        for j in range(query_lens[i]):
+            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + j])
+            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + b_ctx_len[i] + j])
+        # The leading ctx_lens[i] tokens go into the paged cache, one block
+        # at a time.
+        cur_ctx = 0
+        block_id = 0
+        while cur_ctx < b_ctx_len[i]:
+            start_loc = b_seq_start_loc[i] + cur_ctx
+            if cur_ctx + block_size > b_ctx_len[i]:
+                # Last block of the context: only partially filled.
+                end_loc = b_seq_start_loc[i] + b_ctx_len[i]
+            else:
+                end_loc = start_loc + block_size
+            start_slot = block_table[i, block_id] * block_size
+            end_slot = start_slot + end_loc - start_loc
+            k_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_(
+                key[start_loc:end_loc]
+            )
+            v_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_(
+                value[start_loc:end_loc]
+            )
+            cur_ctx += block_size
+            block_id += 1
+    # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
+    # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
+    k_cache = (
+        k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8)
+        .permute(0, 2, 3, 1, 4)
+        .contiguous()
+    )
+    # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
+    # to V_cache[num_blocks, num_kv_heads, head_size, block_size]
+    v_cache = (
+        v_cache.view(-1, block_size, num_kv_heads, head_size)
+        .permute(0, 2, 3, 1)
+        .contiguous()
+    )
+    # Unit scales for both K and V.
+    k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
+
+    # Warm up the Triton kernel by calling it once before actually measuring
+    # generation time
+    op(
+        query,
+        k,
+        v,
+        output,
+        kv_cache_dtype,
+        k_cache,
+        v_cache,
+        block_table,
+        b_start_loc,
+        b_seq_len,
+        MAX_CTX_LEN,
+        max_input_len,
+        k_scale,
+        v_scale,
+        sliding_window=sliding_window,
+    )
+    torch.cuda.synchronize()
+    start_time = time.time()
+    # Timed call; `output` from this call is what gets compared below.
+    op(
+        query,
+        k,
+        v,
+        output,
+        kv_cache_dtype,
+        k_cache,
+        v_cache,
+        block_table,
+        b_start_loc,
+        b_seq_len,
+        MAX_CTX_LEN,
+        max_input_len,
+        k_scale,
+        v_scale,
+        sliding_window=sliding_window,
+    )
+    torch.cuda.synchronize()
+    end_time = time.time()
+    print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms")
+
+    scale = float(1.0 / (head_size**0.5))
+
+    # Reshape for SDPA: (seq_len, num_heads, head_size) ->
+    # (1, num_heads, seq_len, head_size)
+    query_sdpa = query.view(num_tokens, num_kv_heads, num_queries_per_kv, head_size)
+    query_sdpa = query_sdpa.permute(1, 2, 0, 3).reshape(
+        1, num_heads, num_tokens, head_size
+    )
+
+    # Expand key and value for GQA/MQA to match query heads
+    key_sdpa = key[:, :, None, :].expand(
+        key.shape[0], num_kv_heads, num_queries_per_kv, key.shape[-1]
+    )
+    key_sdpa = key_sdpa.permute(1, 2, 0, 3).reshape(
+        1, num_heads, sum(seq_lens), head_size
+    )
+
+    value_sdpa = value[:, :, None, :].expand(
+        value.shape[0], num_kv_heads, num_queries_per_kv, value.shape[-1]
+    )
+    value_sdpa = value_sdpa.permute(1, 2, 0, 3).reshape(
+        1, num_heads, sum(seq_lens), head_size
+    )
+
+    # One mask over the whole packed batch: causal inside each sequence
+    # (optionally windowed), -inf across sequences.
+    attn_mask = create_causal_attention_mask_for_sdpa(
+        query_lens, seq_lens, sliding_window, device=device, dtype=dtype
+    )
+
+    # Untimed first call, mirroring the warm-up done for the op above.
+    output_ref = F.scaled_dot_product_attention(
+        query_sdpa,
+        key_sdpa,
+        value_sdpa,
+        attn_mask=attn_mask,
+        dropout_p=0.0,
+        scale=scale,
+    )
+    torch.cuda.synchronize()
+    start_time = time.time()
+    output_ref = F.scaled_dot_product_attention(
+        query_sdpa,
+        key_sdpa,
+        value_sdpa,
+        attn_mask=attn_mask,
+        dropout_p=0.0,
+        scale=scale,
+    )
+    torch.cuda.synchronize()
+    end_time = time.time()
+    print(f"PyTorch SDPA Time: {(end_time - start_time) * 1000:.2f} ms")
+
+    # Reshape output back to (num_tokens, num_heads, head_size)
+    output_ref = output_ref.view(num_heads, num_tokens, head_size)
+    output_ref = output_ref.permute(1, 0, 2).contiguous()
+    # Absolute-only comparison; a looser bound is used for fp8 KV caches.
+    atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-4
+    torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)
+
+
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("op", OPS)
+@torch.inference_mode()
+def test_contexted_kv_attention_alibi(
+    num_heads: int,
+    num_queries_per_kv: int,
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    device: str,
+    op: Callable,
+) -> None:
+    """Compare a prefix-prefill attention op with SDPA when ALiBi is enabled.
+
+    The setup matches the non-ALiBi test (10 random sequences, context in a
+    paged KV cache, new tokens passed directly), but ``op`` receives
+    ``alibi_slopes`` and the reference runs ``F.scaled_dot_product_attention``
+    once per sequence with an ALiBi + causal bias. Timings are printed but not
+    asserted.
+    """
+    if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89):
+        pytest.skip(
+            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
+        )
+
+    if (
+        current_platform.is_rocm()
+        and op is chunked_prefill_paged_decode
+        and kv_cache_dtype == "fp8_e5m2"
+    ):
+        pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
+
+    current_platform.seed_everything(0)
+    torch.set_default_device(device)
+
+    # Need this, otherwise when we capture the graph the process
+    # for GPU 1 would run on both GPU0 and GPU1 and things would hang
+    #
+    # see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523
+    torch.cuda.set_device(device)
+
+    def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+        """Compute the per-head ALiBi slopes as a 1-D tensor."""
+        # Fork from: vllm/vllm/model_executor/models/bloom.py#L44
+        closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads))
+        base = torch.tensor(
+            2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
+            dtype=torch.float32,
+        )
+        powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+        slopes = torch.pow(base, powers)
+
+        # Head counts that are not a power of two get extra slopes appended.
+        if closest_power_of_2 != total_num_heads:
+            extra_base = torch.tensor(
+                2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
+                dtype=torch.float32,
+            )
+            num_remaining_heads = min(
+                closest_power_of_2, total_num_heads - closest_power_of_2
+            )
+            extra_powers = torch.arange(
+                start=1, end=1 + 2 * num_remaining_heads, step=2, dtype=torch.int32
+            )
+            slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
+        return slopes
+
+    alibi_slopes = _get_alibi_slopes(num_heads).to(device)
+
+    MAX_SEQ_LEN = 1024
+    MAX_CTX_LEN = 1024
+    BS = 10
+    cache_size = 640
+    block_size = 32
+    max_block_per_request = 64
+    query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
+    ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
+    # Full length of each sequence = new (query) tokens + cached context.
+    seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
+    num_kv_heads = num_heads // num_queries_per_kv
+
+    num_tokens = sum(query_lens)
+    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
+    query.uniform_(-1e-3, 1e-3)
+    output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
+
+    # Keys and values for every token of every sequence, stored back to back.
+    kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
+    kv.uniform_(-1e-3, 1e-3)
+    key, value = kv.unbind(dim=1)
+    if kv_cache_dtype == "auto":
+        cache_dtype = dtype
+    else:
+        cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
+    k_cache = torch.zeros(
+        cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype
+    )
+    v_cache = torch.zeros(
+        cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype
+    )
+    # k / v hold only the new (query) tokens; they are passed to the op as-is.
+    k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
+    v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
+    # Shuffle the block ids and deal them out row by row. Since
+    # BS * max_block_per_request == cache_size (10 * 64 == 640), every block
+    # id appears exactly once in the table.
+    values = torch.arange(0, cache_size, dtype=torch.int32)
+    values = values[torch.randperm(cache_size)]
+    block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
+    b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
+    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
+    # Cumulative query offsets (BS + 1 entries) into the flat query/k/v tensors.
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
+    max_input_len = MAX_SEQ_LEN
+    # copy kv to cache
+    # b_seq_start_loc[i] is where sequence i starts inside the flat key/value.
+    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to(
+        torch.int32
+    )
+    for i in range(BS):
+        # The tokens after the context of sequence i are its new tokens.
+        for j in range(query_lens[i]):
+            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + j])
+            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + b_ctx_len[i] + j])
+        # The leading ctx_lens[i] tokens go into the paged cache, one block
+        # at a time.
+        cur_ctx = 0
+        block_id = 0
+        while cur_ctx < b_ctx_len[i]:
+            start_loc = b_seq_start_loc[i] + cur_ctx
+            if cur_ctx + block_size > b_ctx_len[i]:
+                # Last block of the context: only partially filled.
+                end_loc = b_seq_start_loc[i] + b_ctx_len[i]
+            else:
+                end_loc = start_loc + block_size
+            start_slot = block_table[i, block_id] * block_size
+            end_slot = start_slot + end_loc - start_loc
+            k_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_(
+                key[start_loc:end_loc]
+            )
+            v_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_(
+                value[start_loc:end_loc]
+            )
+            cur_ctx += block_size
+            block_id += 1
+    # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
+    # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
+    k_cache = (
+        k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8)
+        .permute(0, 2, 3, 1, 4)
+        .contiguous()
+    )
+    # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
+    # to V_cache[num_blocks, num_kv_heads, head_size, block_size]
+    v_cache = (
+        v_cache.view(-1, block_size, num_kv_heads, head_size)
+        .permute(0, 2, 3, 1)
+        .contiguous()
+    )
+    # Unit scales for both K and V.
+    k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
+
+    # Warm up the Triton kernel by calling it once before actually measuring
+    # generation time
+    op(
+        query,
+        k,
+        v,
+        output,
+        kv_cache_dtype,
+        k_cache,
+        v_cache,
+        block_table,
+        b_start_loc,
+        b_seq_len,
+        MAX_CTX_LEN,
+        max_input_len,
+        k_scale,
+        v_scale,
+        alibi_slopes=alibi_slopes,
+    )
+    torch.cuda.synchronize()
+    start_time = time.time()
+    # Timed call; `output` from this call is what gets compared below.
+    op(
+        query,
+        k,
+        v,
+        output,
+        kv_cache_dtype,
+        k_cache,
+        v_cache,
+        block_table,
+        b_start_loc,
+        b_seq_len,
+        MAX_CTX_LEN,
+        max_input_len,
+        k_scale,
+        v_scale,
+        alibi_slopes=alibi_slopes,
+    )
+    torch.cuda.synchronize()
+    end_time = time.time()
+    print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms")
+    scale = float(1.0 / (head_size**0.5))
+
+    # Prepare query, key, value for SDPA
+    # Expand key and value for GQA/MQA to match query heads
+    key_expanded = key[:, :, None, :].expand(
+        key.shape[0], num_kv_heads, num_queries_per_kv, key.shape[-1]
+    )
+    value_expanded = value[:, :, None, :].expand(
+        value.shape[0], num_kv_heads, num_queries_per_kv, value.shape[-1]
+    )
+
+    output_ref = torch.empty_like(output)
+
+    torch.cuda.synchronize()
+    start_time = time.time()
+
+    # The reference is computed one sequence at a time, since the ALiBi bias
+    # is built per sequence.
+    query_start = 0
+    key_start = 0
+    for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)):
+        query_end = query_start + query_len
+        key_end = key_start + seq_len
+
+        # Get query, key, value for this sequence
+        # NOTE: k / v are rebound here; the new-token tensors of the same name
+        # were only needed for the op calls above.
+        q = query[query_start:query_end]  # [query_len, num_heads, head_size]
+        k = key_expanded[
+            key_start:key_end
+        ]  # [seq_len, num_kv_heads, num_queries_per_kv, head_size]
+        v = value_expanded[
+            key_start:key_end
+        ]  # [seq_len, num_kv_heads, num_queries_per_kv, head_size]
+
+        # Reshape for SDPA: (batch=1, num_heads, seq_len, head_size)
+        q_sdpa = q.view(query_len, num_kv_heads, num_queries_per_kv, head_size)
+        q_sdpa = (
+            q_sdpa.permute(1, 2, 0, 3)
+            .reshape(1, num_heads, query_len, head_size)
+            .contiguous()
+        )
+
+        k_sdpa = (
+            k.permute(1, 2, 0, 3).reshape(1, num_heads, seq_len, head_size).contiguous()
+        )
+        v_sdpa = (
+            v.permute(1, 2, 0, 3).reshape(1, num_heads, seq_len, head_size).contiguous()
+        )
+
+        # Create ALiBi causal mask for this sequence using utility function
+        alibi_mask = create_alibi_causal_mask(
+            query_len, seq_len, alibi_slopes, device, dtype
+        )
+
+        # Compute attention
+        out = F.scaled_dot_product_attention(
+            q_sdpa,
+            k_sdpa,
+            v_sdpa,
+            attn_mask=alibi_mask,
+            dropout_p=0.0,
+            scale=scale,
+        )
+
+        # Reshape output back to [query_len, num_heads, head_size]
+        out = out.view(num_heads, query_len, head_size).permute(1, 0, 2)
+        output_ref[query_start:query_end].copy_(out)
+
+        query_start = query_end
+        key_start = key_end
+
+    torch.cuda.synchronize()
+    end_time = time.time()
+    print(f"PyTorch SDPA Time: {(end_time - start_time) * 1000:.2f} ms")
+    # Absolute-only comparison; a looser bound is used for fp8 KV caches.
+    atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6
+    torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)
+
+
+# These tests are optional to only run when explicitly invoked
+#
+# pytest -v -s --optional \
+# tests/kernels/attention/test_prefix_prefill.py::test_contexted_kv_attention_f32
+#
+# These tests are useful to test model dtype float32 on Turing devices.
+# We skip them to not increase the time when running tests on CI
+@pytest.mark.optional
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", [torch.float32])
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW)
+@pytest.mark.parametrize("op", OPS)
+@torch.inference_mode()
+def test_contexted_kv_attention_f32(
+    num_heads: int,
+    num_queries_per_kv: int,
+    head_size: int,
+    sliding_window: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    device: str,
+    op: Callable,
+) -> None:
+    """Optional float32 variant; delegates to ``test_contexted_kv_attention``."""
+    test_contexted_kv_attention(
+        num_heads=num_heads,
+        num_queries_per_kv=num_queries_per_kv,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        kv_cache_dtype=kv_cache_dtype,
+        device=device,
+        op=op,
+    )
+
+
+@pytest.mark.optional
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("dtype", [torch.float32])
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("op", OPS)
+@torch.inference_mode()
+def test_contexted_kv_attention_alibi_f32(
+    num_heads: int,
+    num_queries_per_kv: int,
+    head_size: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    device: str,
+    op: Callable,
+) -> None:
+    """Optional float32 variant; delegates to the ALiBi test above."""
+    test_contexted_kv_attention_alibi(
+        num_heads=num_heads,
+        num_queries_per_kv=num_queries_per_kv,
+        head_size=head_size,
+        dtype=dtype,
+        kv_cache_dtype=kv_cache_dtype,
+        device=device,
+        op=op,
+    )
--- a/tests/kernels/attention/test_rocm_attention_selector.py
+++ b/tests/kernels/attention/test_rocm_attention_selector.py
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
+from vllm.platforms.rocm import RocmPlatform
+
+
+@pytest.fixture(autouse=True)
+def clear_cache() -> None:
+    """Clear the cached attention-backend lookup before each test.
+
+    The fixture is ``autouse``, so it runs for every test in this module
+    without being requested. Clearing the cache ensures each test case runs
+    without a result cached by an earlier one.
+    """
+    _cached_get_attn_backend.cache_clear()
+
+
+@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
+def test_selector(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Check which attention backend is selected on a (patched) ROCm platform.
+
+    Each step sets ``VLLM_ATTENTION_BACKEND`` (and, for the last step,
+    ``VLLM_ROCM_USE_AITER``), calls ``get_attn_backend`` and asserts on the
+    name of the returned backend. All environment and attribute overrides go
+    through the scoped ``m`` so they are undone together when the context
+    exits.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_ATTN")
+
+        # Make the selector see a ROCm platform. This is patched through the
+        # scoped ``m`` (not the outer fixture) so it is reverted with the env
+        # changes instead of lingering until test teardown.
+        m.setattr("vllm.attention.selector.current_platform", RocmPlatform())
+
+        # Test standard ROCm attention
+        backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
+        assert backend.get_name() in ("ROCM_FLASH", "TRITON_ATTN")
+
+        # MLA test for deepseek related
+
+        # change the attention backend to triton MLA
+        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_MLA")
+        backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
+        assert backend.get_name() == "TRITON_MLA"
+
+        # If no attention backend is requested (empty env value)
+        # and use_mla is true,
+        # the selected backend is triton MLA
+        m.setenv("VLLM_ATTENTION_BACKEND", "")
+        backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
+        assert backend.get_name() == "TRITON_MLA"
+
+        # change the attention backend to AITER MLA
+        m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_MLA")
+        backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
+        assert backend.get_name() == "ROCM_AITER_MLA"
+
+        # If no attention backend is requested (empty env value),
+        # use_mla is true
+        # and VLLM_ROCM_USE_AITER is enabled,
+        # the selected backend is ROCM_AITER_MLA
+        m.setenv("VLLM_ATTENTION_BACKEND", "")
+        m.setenv("VLLM_ROCM_USE_AITER", "1")
+        backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
+        assert backend.get_name() == "ROCM_AITER_MLA"
--- a/tests/kernels/attention/test_triton_decode_attention.py
+++ b/tests/kernels/attention/test_triton_decode_attention.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
+from vllm.utils.math_utils import cdiv
+
+
+@pytest.mark.parametrize("B", [3, 5])
+@pytest.mark.parametrize("L", [1027, 1025])
+@pytest.mark.parametrize("H_Q", [32])
+@pytest.mark.parametrize("H_KV", [32, 8])
+@pytest.mark.parametrize("D_QK", [128, 192, 576])
+@pytest.mark.parametrize("D_V", [128, 512])
+@pytest.mark.parametrize("CACHE_SIZE", [16384])
+@pytest.mark.parametrize("PAGE_SIZE", [1, 16])
+def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
+    """Check that paged and flat KV layouts give the same decode output.
+
+    ``decode_attention_fwd`` is called twice on the same random K/V data:
+    first with flat buffers and a per-token index table, then with the
+    buffers viewed as pages of ``PAGE_SIZE`` tokens, a per-page index table
+    and ``PAGE_SIZE`` passed as the last argument. The two outputs are
+    compared with ``torch.allclose``.
+    """
+    assert CACHE_SIZE % PAGE_SIZE == 0
+    dtype = torch.bfloat16
+    seq_len = L  # This represents the number of tokens already in the sequence
+    sm_scale = 1.0 / (D_QK**0.5)
+    num_kv_splits = 8
+
+    # Random page ids per request, shape (B, num_pages_per_batch, 1).
+    num_pages_per_batch = cdiv(seq_len, PAGE_SIZE)
+    req_to_page = torch.randint(
+        0, CACHE_SIZE // PAGE_SIZE, (B, num_pages_per_batch, 1), device="cuda"
+    )
+    # Expand each page id into the token slots it covers
+    # (page_id * PAGE_SIZE + offset), flatten per request and keep only the
+    # first seq_len slots.
+    req_to_token = req_to_page * PAGE_SIZE
+    req_to_token = req_to_token.expand(B, num_pages_per_batch, PAGE_SIZE)
+    req_to_token = req_to_token + torch.arange(PAGE_SIZE, device="cuda").view(1, 1, -1)
+    req_to_token = req_to_token.view(B, -1)
+    req_to_token = req_to_token[:, :seq_len].contiguous()
+
+    # q represents the new token being generated, one per batch
+    q = torch.randn(B, H_Q, D_QK, dtype=dtype, device="cuda")
+
+    # k_buffer and v_buffer represent all previous tokens
+    # Stored flat here, one row per token slot (i.e. as if the page size were 1).
+    k_buffer = torch.randn(CACHE_SIZE, H_KV, D_QK, dtype=dtype, device="cuda")
+    v_buffer = torch.randn(CACHE_SIZE, H_KV, D_V, dtype=dtype, device="cuda")
+
+    # o shares q's batch and head dims, but its last dim is D_V (not D_QK)
+    o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
+
+    lse = torch.zeros(B, H_Q, dtype=dtype, device="cuda")
+
+    # Every request uses the same sequence length.
+    b_seq_len = torch.full((B,), seq_len, device="cuda")
+
+    # float32 buffer handed to both calls below; presumably per-split scratch
+    # space for the kernel — TODO confirm the meaning of the D_V + 1 slot.
+    attn_logits = torch.empty(
+        (B, H_Q, num_kv_splits, D_V + 1),
+        dtype=torch.float32,
+        device="cuda",
+    )
+
+    # Call the original implementation.
+    # (flat buffers, per-token index table, no page-size argument)
+    decode_attention_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        o,
+        lse,
+        req_to_token,
+        b_seq_len,
+        attn_logits,
+        num_kv_splits,
+        sm_scale,
+    )
+
+    # Page size can be larger than 1.
+    # Same storage, re-viewed as (num_pages, PAGE_SIZE, H_KV, D).
+    k_buffer = k_buffer.view(CACHE_SIZE // PAGE_SIZE, PAGE_SIZE, H_KV, D_QK)
+    v_buffer = v_buffer.view(CACHE_SIZE // PAGE_SIZE, PAGE_SIZE, H_KV, D_V)
+
+    o1 = torch.zeros_like(o)
+    lse1 = torch.zeros_like(lse)
+
+    decode_attention_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        o1,
+        lse1,
+        req_to_page,
+        b_seq_len,
+        attn_logits,
+        num_kv_splits,
+        sm_scale,
+        PAGE_SIZE,
+    )
+
+    # Only the attention outputs are compared; lse and lse1 are not checked.
+    assert torch.allclose(o, o1)
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -0,0 +1,218 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+import torch
+
+from vllm.attention.ops.triton_unified_attention import unified_attention
+from vllm.platforms import current_platform
+from vllm.utils.math_utils import next_power_of_2
+
+# (num_query_heads, num_kv_heads) pairs; the test unpacks them in that order.
+NUM_HEADS = [(4, 4), (8, 2)]
+HEAD_SIZES = [128, 256]
+BLOCK_SIZES = [16]
+
+DTYPES = [torch.bfloat16]
+# Quantization dtypes for the test: None runs unquantized, otherwise Q/K/V are
+# cast to the fp8 variant chosen per platform (e4m3fnuz on ROCm, e4m3fn elsewhere).
+QDTYPES = (
+    [None, torch.float8_e4m3fn]
+    if not current_platform.is_rocm()
+    else [None, torch.float8_e4m3fnuz]
+)
+# Number of KV-cache blocks:
+# one value large enough to test overflow in index calculation.
+# one value small enough to test the schema op check
+NUM_BLOCKS = [32768, 2048]
+
+# Values passed as seq_threshold_3D:
+# 0: use 2D kernel for decode
+# 8: use 3D kernel for decode
+SEQ_THRESHOLD_3D_VALUES = [0, 8]
+
+
+def ref_paged_attn(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    query_lens: list[int],
+    kv_lens: list[int],
+    block_tables: torch.Tensor,
+    scale: float,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
+) -> torch.Tensor:
+    """Reference paged attention, computed one sequence at a time in PyTorch.
+
+    Args:
+        query: Queries of all sequences concatenated along dim 0, indexed as
+            (token, head, head_size). It is not modified.
+        key_cache: Paged keys of shape
+            (num_blocks, block_size, num_kv_heads, head_size).
+        value_cache: Paged values, viewed with the same layout as
+            ``key_cache``.
+        query_lens: Number of query tokens per sequence.
+        kv_lens: Number of key/value tokens per sequence.
+        block_tables: Row ``i`` lists the cache block ids of sequence ``i``.
+        scale: Factor applied to the queries before the dot product.
+        sliding_window: If given, keys that lie that many positions or more
+            behind a query's own position are masked out as well.
+        soft_cap: If given and positive, logits are squashed with
+            ``soft_cap * tanh(logits / soft_cap)`` before masking.
+
+    Returns:
+        The attention outputs of all sequences concatenated along dim 0, in
+        the same token order as ``query``.
+    """
+    num_seqs = len(query_lens)
+    block_tables = block_tables.cpu().numpy()
+    _, block_size, num_kv_heads, head_size = key_cache.shape
+
+    outputs: list[torch.Tensor] = []
+    start_idx = 0
+    for i in range(num_seqs):
+        query_len = query_lens[i]
+        kv_len = kv_lens[i]
+        # Scale out of place: the slice is a view of ``query``, so an in-place
+        # ``*=`` would silently rescale the caller's tensor.
+        q = query[start_idx : start_idx + query_len] * scale
+
+        # Gather this sequence's blocks and drop the padding of the last one.
+        num_kv_blocks = (kv_len + block_size - 1) // block_size
+        block_indices = block_tables[i, :num_kv_blocks]
+
+        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
+        k = k[:kv_len]
+        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
+        v = v[:kv_len]
+
+        # Grouped-query attention: repeat KV heads to match the query heads.
+        if q.shape[1] != k.shape[1]:
+            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
+            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
+        attn = torch.einsum("qhd,khd->hqk", q, k).float()
+        # Causal mask with the queries aligned to the end of the KV sequence;
+        # True marks positions to be masked out.
+        empty_mask = torch.ones(query_len, kv_len)
+        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
+        if sliding_window is not None:
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
+            mask |= sliding_window_mask
+        if soft_cap is not None and soft_cap > 0:
+            attn = soft_cap * torch.tanh(attn / soft_cap)
+        attn.masked_fill_(mask, float("-inf"))
+        attn = torch.softmax(attn, dim=-1).to(v.dtype)
+        out = torch.einsum("hqk,khd->qhd", attn, v)
+
+        outputs.append(out)
+        start_idx += query_len
+
+    return torch.cat(outputs, dim=0)
+
+
+@pytest.mark.parametrize(
+    "seq_lens", [[(1, 1328), (5, 18), (129, 463)], [(1, 523), (1, 37), (1, 2011)]]
+)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("sliding_window", [None, 64, 128, 256])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("soft_cap", [None, 50.0])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("q_dtype", QDTYPES)
+@pytest.mark.parametrize("seq_threshold_3D", SEQ_THRESHOLD_3D_VALUES)
+@torch.inference_mode()
+def test_triton_unified_attn(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    q_dtype: torch.dtype | None,
+    seq_threshold_3D: int,
+) -> None:
+    """Compare ``unified_attention`` against the ``ref_paged_attn`` reference.
+
+    Args:
+        seq_lens: One ``(query_len, kv_len)`` pair per sequence.
+        num_heads: ``(num_query_heads, num_kv_heads)``.
+        head_size: Size of each attention head.
+        sliding_window: Sliding-window length, or None to disable it.
+        dtype: Dtype of the unquantized query and KV cache.
+        block_size: Number of tokens per KV-cache block.
+        soft_cap: Logit soft cap, or None to disable it.
+        num_blocks: Number of blocks in the KV cache.
+        q_dtype: If not None, Q/K/V are cast to this dtype for the kernel.
+        seq_threshold_3D: Forwarded to the kernel; also sizes the softmax
+            segment buffers.
+    """
+    torch.set_default_device("cuda")
+
+    current_platform.seed_everything(0)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_query_len = max(query_lens)
+    max_kv_len = max(kv_lens)
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
+    scale = head_size**-0.5
+
+    # Queries of all sequences are packed along dim 0; cu_query_lens holds the
+    # cumulative offsets of each sequence within that packed tensor.
+    query = torch.randn(sum(query_lens), num_query_heads, head_size, dtype=dtype)
+    key_cache = torch.randn(
+        num_blocks, block_size, num_kv_heads, head_size, dtype=dtype
+    )
+    value_cache = torch.randn_like(key_cache)
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
+    kv_lens = torch.tensor(kv_lens, dtype=torch.int32)
+
+    # Random block ids, enough columns to cover the longest KV sequence.
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    output = torch.empty_like(query)
+
+    maybe_quantized_query = query
+    maybe_quantized_key_cache = key_cache
+    maybe_quantized_value_cache = value_cache
+    q_descale = None
+    k_descale = None
+    v_descale = None
+    if q_dtype is not None:
+        # QKV are drawn from N(0, 1): no need for a fp8 scaling factor
+        maybe_quantized_query = query.to(q_dtype)
+        maybe_quantized_key_cache = key_cache.to(q_dtype)
+        maybe_quantized_value_cache = value_cache.to(q_dtype)
+
+        scale_shape = (num_seqs, num_kv_heads)
+        q_descale = None  # Not yet supported
+        k_descale = torch.rand(scale_shape, dtype=torch.float32)
+        v_descale = torch.rand(scale_shape, dtype=torch.float32)
+
+    # float32 buffers handed to the kernel; their leading dim is
+    # seq_threshold_3D, so they are zero-sized when the threshold is 0.
+    num_par_softmax_segments = 16
+    head_size_padded = next_power_of_2(head_size)
+    softmax_segm_output = torch.empty(
+        (seq_threshold_3D, num_query_heads, num_par_softmax_segments, head_size_padded),
+        dtype=torch.float32,
+    )
+    softmax_segm_max = torch.empty(
+        (seq_threshold_3D, num_query_heads, num_par_softmax_segments),
+        dtype=torch.float32,
+    )
+    softmax_segm_expsum = torch.empty(
+        (seq_threshold_3D, num_query_heads, num_par_softmax_segments),
+        dtype=torch.float32,
+    )
+
+    unified_attention(
+        q=maybe_quantized_query,
+        k=maybe_quantized_key_cache,
+        v=maybe_quantized_value_cache,
+        out=output,
+        cu_seqlens_q=cu_query_lens,
+        seqused_k=kv_lens,
+        max_seqlen_q=max_query_len,
+        max_seqlen_k=max_kv_len,
+        softmax_scale=scale,
+        causal=True,
+        window_size=window_size,
+        block_table=block_tables,
+        softcap=soft_cap if soft_cap is not None else 0,
+        q_descale=q_descale,
+        k_descale=k_descale,
+        v_descale=v_descale,
+        seq_threshold_3D=seq_threshold_3D,
+        num_par_softmax_segments=num_par_softmax_segments,
+        softmax_segm_output=softmax_segm_output,
+        softmax_segm_max=softmax_segm_max,
+        softmax_segm_expsum=softmax_segm_expsum,
+    )
+
+    # NOTE(review): the reference runs on the unquantized tensors and takes no
+    # descale factors; only the looser fp8 tolerances below cover that gap —
+    # confirm this is intended.
+    ref_output = ref_paged_attn(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        query_lens=query_lens,
+        kv_lens=kv_lens,
+        block_tables=block_tables,
+        scale=scale,
+        sliding_window=sliding_window,
+        soft_cap=soft_cap,
+    )
+    atol, rtol = 1.5e-2, 1e-2
+    if q_dtype is not None:
+        atol, rtol = 1.5e-1, 1.5e-1
+    # assert_close raises on mismatch and its error already reports the
+    # greatest absolute and relative difference, so no extra message is built.
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)