Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,191 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import time
import pytest
import torch
from vllm.platforms import current_platform
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
# Attention backends whose KV-cache layouts are exercised by test_transfer.
BACKENDS_TO_TEST = [FlashAttentionBackend]
if not current_platform.is_rocm():
    # These two backends are only imported off ROCm (NOTE(review): presumably
    # because the modules are CUDA-only and would fail to import on ROCm).
    from vllm.v1.attention.backends.flashinfer import FlashInferBackend

    BACKENDS_TO_TEST.append(FlashInferBackend)
    from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend

    BACKENDS_TO_TEST.append(FlashAttnMLABackend)

# Parametrization grids for test_transfer below.
NUM_GPU_BLOCKS = [64]
NUM_CPU_BLOCKS = [256]
GPU_BLOCK_SIZES = [16]
# 1 == CPU and GPU blocks are the same size; 3 == one CPU block spans 3 GPU blocks.
GPU_BLOCKS_PER_CPU_BLOCK = [1, 3]
HEAD_SIZES = [64]
NUM_HEADS = [8]
NUM_LAYERS = [4]
DTYPES = [torch.bfloat16]
SEEDS = [0]
CUDA_DEVICES = ["cuda:0"]
# number of CPU-block-sized mappings transferred per test case
NUM_MAPPINGS = [3]
@pytest.mark.parametrize("gpu_to_cpu", [True, False])
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("gpu_block_size", GPU_BLOCK_SIZES)
@pytest.mark.parametrize("gpu_blocks_per_cpu_block", GPU_BLOCKS_PER_CPU_BLOCK)
@pytest.mark.parametrize("num_gpu_blocks", NUM_GPU_BLOCKS)
@pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_transfer(
    gpu_to_cpu: bool,
    num_mappings: int,
    head_size: int,
    num_heads: int,
    gpu_block_size: int,
    gpu_blocks_per_cpu_block: int,
    num_gpu_blocks: int,
    num_cpu_blocks: int,
    num_layers: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    """Run one async KV-cache transfer through CpuGpuOffloadingHandlers and
    verify the result block-by-block.

    Builds per-layer GPU caches (cycling through BACKENDS_TO_TEST), submits a
    single transfer (GPU->CPU or CPU->GPU depending on ``gpu_to_cpu``), waits
    for completion, then checks that mapped destination blocks equal their
    source blocks and that every other block (and all source blocks) is
    untouched.
    """
    current_platform.seed_everything(seed)

    # create per-layer GPU KV caches based on available attn_backends
    attn_backends_list = BACKENDS_TO_TEST
    gpu_caches = {}
    attn_backends = {}
    for i in range(num_layers):
        layer_name = f"layer {i}"
        # cycle through backends so each layer may use a different cache layout
        attn_backend = attn_backends_list[i % len(attn_backends_list)]
        attn_backends[layer_name] = attn_backend
        gpu_cache_shape = attn_backend.get_kv_cache_shape(
            num_gpu_blocks, gpu_block_size, num_heads, head_size
        )
        gpu_caches[layer_name] = torch.rand(gpu_cache_shape, dtype=dtype, device=device)

    # create handler
    cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size
    handlers = CpuGpuOffloadingHandlers(
        attn_backends=attn_backends,
        gpu_block_size=gpu_block_size,
        cpu_block_size=cpu_block_size,
        num_cpu_blocks=num_cpu_blocks,
        gpu_caches=gpu_caches,
    )

    # select block mappings
    gpu_blocks = random.sample(
        range(num_gpu_blocks), num_mappings * gpu_blocks_per_cpu_block
    )
    cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings)

    # convert cpu blocks to gpu block size
    cpu_blocks_in_gpu_block_size = []
    for cpu_block in cpu_blocks:
        base_block_id = cpu_block * gpu_blocks_per_cpu_block
        for i in range(gpu_blocks_per_cpu_block):
            cpu_blocks_in_gpu_block_size.append(i + base_block_id)

    # maybe skip a GPU block to test reading from the middle of a CPU block
    if not gpu_to_cpu:
        gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1 :]
        cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[
            gpu_blocks_per_cpu_block - 1 :
        ]

    # set transfer direction
    if gpu_to_cpu:
        handler = handlers.gpu_to_cpu_handler
        src_spec_class = GPULoadStoreSpec
        dst_spec_class = CPULoadStoreSpec
        src_blocks = gpu_blocks
        dst_blocks = cpu_blocks
        src_blocks_in_gpu_block_size = gpu_blocks
        dst_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_size_in_gpu_blocks = num_cpu_blocks * gpu_blocks_per_cpu_block
    else:
        handler = handlers.cpu_to_gpu_handler
        src_spec_class = CPULoadStoreSpec
        dst_spec_class = GPULoadStoreSpec
        src_blocks = cpu_blocks
        dst_blocks = gpu_blocks
        src_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_blocks_in_gpu_block_size = gpu_blocks
        dst_size_in_gpu_blocks = num_gpu_blocks

    # build dst -> src mapping (in GPU-block-sized units) for verification
    dst_to_src = {}
    for src_block, dst_block in zip(
        src_blocks_in_gpu_block_size, dst_blocks_in_gpu_block_size
    ):
        dst_to_src[dst_block] = src_block

    # build transfer specs
    src_spec = src_spec_class(src_blocks)
    dst_spec = dst_spec_class(dst_blocks)

    # clone src and dst tensors before transfer
    orig_src_caches = [x.clone() for x in handler.src_tensors]
    orig_dst_caches = [x.clone() for x in handler.dst_tensors]

    # call transfer function (job_id=1)
    assert handler.transfer_async(1, (src_spec, dst_spec))
    # NOTE(review): peeks at the handler's private _transfers bookkeeping
    assert set({x[0] for x in handler._transfers}) == {1}

    # wait for transfer to complete (poll up to ~10 seconds)
    end_time = time.time() + 10
    while time.time() < end_time:
        finished = handler.get_finished()
        if finished:
            assert finished == [(1, True)]
            break
        time.sleep(0.1)

    # verify src tensors did not change
    for orig_tensor, tensor in zip(orig_src_caches, handler.src_tensors):
        assert torch.equal(orig_tensor, tensor)

    # verify dst tensors: mapped blocks must equal their source block,
    # unmapped blocks must be unchanged
    for dst_block in range(dst_size_in_gpu_blocks):
        src_block_candidate = dst_to_src.get(dst_block)
        for src_cache, dst_cache, orig_dst_cache, kv_dim in zip(
            handler.src_tensors,
            handler.dst_tensors,
            orig_dst_caches,
            handler.kv_dim_before_num_blocks,
        ):
            if kv_dim:
                # iterate over key, value
                for i in range(2):
                    if src_block_candidate is not None:
                        expected_value = src_cache[i][src_block_candidate]
                    else:
                        expected_value = orig_dst_cache[i][dst_block]
                    torch.testing.assert_close(
                        dst_cache[i][dst_block].cpu(), expected_value.cpu()
                    )
            else:
                # layout without a leading key/value dimension (e.g. MLA)
                if src_block_candidate is not None:
                    expected_value = src_cache[src_block_candidate]
                else:
                    expected_value = orig_dst_cache[dst_block]
                torch.testing.assert_close(
                    dst_cache[dst_block].cpu(), expected_value.cpu()
                )

View File

@@ -0,0 +1,497 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from dataclasses import dataclass
import numpy as np
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.kv_offload.abstract import (
LoadStoreSpec,
OffloadingEvent,
PrepareStoreOutput,
)
from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager
from vllm.v1.kv_offload.backends.cpu import CPUBackend
from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
@dataclass
class ExpectedPrepareStoreOutput:
    """Plain-int description of an expected PrepareStoreOutput.

    Integers stand in for block hashes (converted via to_hashes) so test
    expectations stay readable.
    """

    # int stand-ins for the hashes expected in block_hashes_to_store
    block_hashes_to_store: list[int]
    # backend block ids expected in the resulting store spec
    store_block_ids: list[int]
    # int stand-ins for the hashes expected to be evicted
    block_hashes_evicted: list[int]
def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
    """Turn each integer into a BlockHash built from its decimal utf-8 bytes."""
    return [BlockHash(f"{i}".encode()) for i in int_hashes]
def verify_store_output(
    prepare_store_output: PrepareStoreOutput | None,
    expected_prepare_store_output: ExpectedPrepareStoreOutput,
):
    """Assert that a PrepareStoreOutput matches its plain-int expectation.

    Checks the hashes to store, the hashes evicted, and that the store spec
    is a CPULoadStoreSpec carrying exactly the expected block ids.
    """
    expected = expected_prepare_store_output
    assert prepare_store_output is not None
    assert prepare_store_output.block_hashes_to_store == to_hashes(
        expected.block_hashes_to_store
    )
    assert prepare_store_output.block_hashes_evicted == to_hashes(
        expected.block_hashes_evicted
    )
    spec = prepare_store_output.store_spec
    assert isinstance(spec, CPULoadStoreSpec)
    want_ids = np.array(expected.store_block_ids, dtype=np.int64)
    assert np.array_equal(want_ids, spec.block_ids)
def verify_load_output(
    prepare_load_output: LoadStoreSpec, expected_prepare_load_output: list[int]
):
    """Assert that a load spec is a CPULoadStoreSpec with the expected ids."""
    assert isinstance(prepare_load_output, CPULoadStoreSpec)
    want_ids = np.array(expected_prepare_load_output, dtype=np.int64)
    assert np.array_equal(want_ids, prepare_load_output.block_ids)
def verify_events(
    events: Iterable[OffloadingEvent],
    block_size: int,
    expected_stores: tuple[set[int], ...] = (),
    expected_evictions: tuple[set[int], ...] = (),
):
    """Assert that the event stream contains exactly the expected store and
    eviction batches (as sets of hashes), in order, all on the CPU medium
    with the given block size.
    """
    seen_stores: list[set[BlockHash]] = []
    seen_evictions: list[set[BlockHash]] = []
    for event in events:
        assert event.medium == CPULoadStoreSpec.medium()
        assert event.block_size == block_size
        # route each batch into the matching bucket
        bucket = seen_evictions if event.removed else seen_stores
        bucket.append(set(event.block_hashes))

    def as_hash_sets(int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]:
        return tuple(set(to_hashes(list(int_set))) for int_set in int_sets)

    assert tuple(seen_evictions) == as_hash_sets(expected_evictions)
    assert tuple(seen_stores) == as_hash_sets(expected_stores)
def test_cpu_manager():
    """
    Tests LRUOffloadingManager with a CPUBackend.

    Walks through a fixed sequence of store/load/touch operations on a
    4-block cache and asserts block placement, eviction order (LRU),
    ref-count protection of in-flight loads, and the emitted events.
    Each step depends on the manager state left by the previous one.
    """
    # initialize a CPU backend with a capacity of 4 blocks
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    cpu_manager = LRUOffloadingManager(cpu_backend, enable_events=True)

    # prepare store [1, 2]
    prepare_store_output = cpu_manager.prepare_store(to_hashes([1, 2]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[1, 2],
            store_block_ids=[0, 1],
            block_hashes_evicted=[],
        ),
    )

    # lookup [1, 2] -> not ready (store not completed yet)
    assert cpu_manager.lookup(to_hashes([1, 2])) == 0

    # no events so far
    assert list(cpu_manager.take_events()) == []

    # complete store [1, 2]
    cpu_manager.complete_store(to_hashes([1, 2]))
    verify_events(
        cpu_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
    )

    # lookup [1, 2] -> counts matching leading blocks only
    assert cpu_manager.lookup(to_hashes([1])) == 1
    assert cpu_manager.lookup(to_hashes([1, 2])) == 2
    assert cpu_manager.lookup(to_hashes([1, 2, 3])) == 2

    # prepare store [2, 3, 4, 5] -> evicts [1]; block 2 is already stored
    prepare_store_output = cpu_manager.prepare_store(to_hashes([2, 3, 4, 5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[3, 4, 5],
            store_block_ids=[2, 3, 0],
            block_hashes_evicted=[1],
        ),
    )

    # verify eviction event
    verify_events(
        cpu_manager.take_events(), block_size=block_size, expected_evictions=({1},)
    )

    # prepare store with no space
    assert cpu_manager.prepare_store(to_hashes([1, 6])) is None

    # complete store [2, 3, 4, 5]
    cpu_manager.complete_store(to_hashes([2, 3, 4, 5]))

    # prepare load [2, 3]
    prepare_load_output = cpu_manager.prepare_load(to_hashes([2, 3]))
    verify_load_output(prepare_load_output, [1, 2])

    # prepare store with no space ([2, 3] is being loaded, so not evictable)
    assert cpu_manager.prepare_store(to_hashes([6, 7, 8])) is None

    # complete load [2, 3]
    cpu_manager.complete_load(to_hashes([2, 3]))

    # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest)
    prepare_store_output = cpu_manager.prepare_store(to_hashes([6, 7, 8]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[6, 7, 8],
            store_block_ids=[3, 2, 1],
            block_hashes_evicted=[2, 3, 4],
        ),
    )

    # complete store [6, 7, 8]
    cpu_manager.complete_store(to_hashes([6, 7, 8]))

    # touch [5, 6, 7] (move to end of LRU order)
    cpu_manager.touch(to_hashes([5, 6, 7]))

    # prepare store [9] -> evicts [8] (oldest following previous touch)
    prepare_store_output = cpu_manager.prepare_store(to_hashes([9]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[9],
            store_block_ids=[1],
            block_hashes_evicted=[8],
        ),
    )

    # complete store [7, 9] with failure; 7 was already stored, 9 was pending
    cpu_manager.complete_store(to_hashes([7, 9]), success=False)

    # assert [7] is still stored, but [9] is not
    assert cpu_manager.lookup(to_hashes([7])) == 1
    assert cpu_manager.lookup(to_hashes([9])) == 0

    # drain remaining events accumulated since the last verify_events call
    verify_events(
        cpu_manager.take_events(),
        block_size=block_size,
        expected_stores=({3, 4, 5}, {6, 7, 8}),
        expected_evictions=({2, 3, 4}, {8}),
    )
def test_arc_manager_basic():
    """
    Tests ARCOffloadingManager basic operations with a CPUBackend.
    Verifies that ARC handles store, load, and lookup operations correctly,
    and that freshly stored blocks land in T1 (the recency list).
    """
    # initialize a CPU backend with a capacity of 4 blocks
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # prepare store [1, 2]
    prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[1, 2],
            store_block_ids=[0, 1],
            block_hashes_evicted=[],
        ),
    )

    # lookup [1, 2] -> not ready (store not completed yet)
    assert arc_manager.lookup(to_hashes([1, 2])) == 0

    # no events so far
    assert list(arc_manager.take_events()) == []

    # complete store [1, 2]
    arc_manager.complete_store(to_hashes([1, 2]))
    verify_events(
        arc_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
    )

    # lookup [1, 2] -> counts matching leading blocks only
    assert arc_manager.lookup(to_hashes([1])) == 1
    assert arc_manager.lookup(to_hashes([1, 2])) == 2
    assert arc_manager.lookup(to_hashes([1, 2, 3])) == 2

    # blocks should be in T1 (recent), none promoted to T2 (frequent) yet
    assert len(arc_manager.t1) == 2
    assert len(arc_manager.t2) == 0
def test_arc_manager_t1_to_t2_promotion():
    """
    Tests that accessing a block in T1 promotes it to T2 (frequent).
    This is a key feature of ARC's adaptive behavior.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # store and complete block 1
    arc_manager.prepare_store(to_hashes([1]))
    arc_manager.complete_store(to_hashes([1]))

    # block 1 starts in T1 (recent)
    assert to_hashes([1])[0] in arc_manager.t1
    assert to_hashes([1])[0] not in arc_manager.t2

    # touch block 1 (simulate second access)
    arc_manager.touch(to_hashes([1]))

    # block 1 should now be in T2 (frequent)
    assert to_hashes([1])[0] not in arc_manager.t1
    assert to_hashes([1])[0] in arc_manager.t2
def test_arc_manager_eviction_with_load():
    """
    Tests ARC eviction behavior similar to LRU test.
    Verifies that blocks being loaded (ref_cnt > 0) cannot be evicted,
    and that eviction succeeds once the load completes.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # prepare and complete store [1, 2, 3, 4] (fills the cache)
    prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[1, 2, 3, 4],
            store_block_ids=[0, 1, 2, 3],
            block_hashes_evicted=[],
        ),
    )
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # prepare load [2, 3] (increases ref_cnt)
    prepare_load_output = arc_manager.prepare_load(to_hashes([2, 3]))
    verify_load_output(prepare_load_output, [1, 2])

    # prepare store [5, 6, 7] with [2, 3] being loaded
    # should fail because [2, 3] have ref_cnt > 0
    assert arc_manager.prepare_store(to_hashes([5, 6, 7])) is None

    # complete load [2, 3]
    arc_manager.complete_load(to_hashes([2, 3]))

    # now prepare store [5, 6, 7] should succeed
    # ARC will evict blocks one at a time from T1 as needed
    prepare_store_output = arc_manager.prepare_store(to_hashes([5, 6, 7]))
    assert prepare_store_output is not None

    # Should successfully evict enough blocks to make room (at least 1)
    assert len(prepare_store_output.block_hashes_evicted) >= 1
def test_arc_manager_adaptive_target():
    """
    Tests ARC's adaptive target adjustment via ghost lists.
    When a block in B1 (ghost list) is accessed, target_t1_size increases.
    When a block in B2 is accessed, target_t1_size decreases.
    Only the B1 (recency-favoring) direction is exercised here.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=2)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # store blocks 1, 2 (fills cache)
    arc_manager.prepare_store(to_hashes([1, 2]))
    arc_manager.complete_store(to_hashes([1, 2]))

    initial_target = arc_manager.target_t1_size

    # store block 3, evicting block 1 (moves to B1 ghost list)
    arc_manager.prepare_store(to_hashes([3]))
    arc_manager.complete_store(to_hashes([3]))

    # block 1 should be in B1 (ghost list)
    assert to_hashes([1])[0] in arc_manager.b1

    # touch block 1 (cache miss, but in B1)
    # this should increase target_t1_size (favor recency)
    arc_manager.touch(to_hashes([1]))

    # target should have increased
    assert arc_manager.target_t1_size > initial_target
def test_arc_manager_t1_t2_eviction_policy():
    """
    Tests that ARC evicts from T1 or T2 based on target_t1_size.
    If |T1| >= target_t1_size, evict from T1, otherwise from T2.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # store blocks 1, 2, 3, 4
    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # promote blocks 3, 4 to T2 by touching them
    arc_manager.touch(to_hashes([3, 4]))

    # now: T1 = {1, 2}, T2 = {3, 4}
    assert len(arc_manager.t1) == 2
    assert len(arc_manager.t2) == 2

    # set target_t1_size to prefer evicting from T1
    # (when |T1| >= target, evict from T1)
    arc_manager.target_t1_size = 1

    # store block 5, should evict from T1 (block 1, LRU in T1)
    output = arc_manager.prepare_store(to_hashes([5]))
    assert output is not None
    # exactly block 1 is evicted (list comparison: single element)
    assert to_hashes([1]) == output.block_hashes_evicted
    arc_manager.complete_store(to_hashes([5]))

    # block 1 should be in B1 (ghost list)
    assert to_hashes([1])[0] in arc_manager.b1
    # block 5 should be in T1
    assert to_hashes([5])[0] in arc_manager.t1
def test_arc_manager_ghost_list_bounds():
    """
    Tests that ghost lists (B1, B2) don't grow unbounded.
    They should be capped at cache_capacity.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=2)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # fill cache with blocks 1, 2
    arc_manager.prepare_store(to_hashes([1, 2]))
    arc_manager.complete_store(to_hashes([1, 2]))

    # store many blocks to fill ghost lists (each store evicts an older block,
    # pushing its hash into a ghost list)
    for i in range(3, 20):
        arc_manager.prepare_store(to_hashes([i]))
        arc_manager.complete_store(to_hashes([i]))

    # ghost lists should not exceed cache_capacity
    assert len(arc_manager.b1) <= arc_manager.cache_capacity
    assert len(arc_manager.b2) <= arc_manager.cache_capacity
def test_arc_manager_touch_ordering():
    """
    Tests that touch() correctly updates access patterns.
    Similar to LRU test but verifies T1/T2 ordering.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # store blocks 1, 2, 3, 4
    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # promote 3, 4 to T2
    arc_manager.touch(to_hashes([3, 4]))
    # T1 = {1, 2}, T2 = {3, 4}

    # touch [1, 3, 4] - should promote 1 to T2, and move 3,4 to end of T2
    arc_manager.touch(to_hashes([1, 3, 4]))
    # T1 = {2}, T2 = {1, 3, 4} (in that order, with 4 most recent)
    assert len(arc_manager.t1) == 1
    assert len(arc_manager.t2) == 3

    # store block 5, should evict from T1 (block 2, only one in T1)
    prepare_store_output = arc_manager.prepare_store(to_hashes([5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[5],
            store_block_ids=[1],  # reuses block 2's storage
            block_hashes_evicted=[2],
        ),
    )
def test_arc_manager_failed_store():
    """
    Tests that failed store operations clean up correctly.
    Similar to LRU test but for ARC: the failed block must not appear in
    the cache or in T1/T2, while the eviction it triggered remains.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # store blocks 1, 2, 3, 4
    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # prepare store block 5 (will evict one block)
    prepare_store_output = arc_manager.prepare_store(to_hashes([5]))
    assert prepare_store_output is not None
    assert len(prepare_store_output.block_hashes_evicted) == 1

    # complete store with failure
    arc_manager.complete_store(to_hashes([5]), success=False)

    # block 5 should not be in cache
    assert arc_manager.lookup(to_hashes([5])) == 0
    # block 5 should not be in T1 or T2
    assert to_hashes([5])[0] not in arc_manager.t1
    assert to_hashes([5])[0] not in arc_manager.t2

    # evicted block should still be gone (in B1 ghost list)
    evicted_hash = prepare_store_output.block_hashes_evicted[0]
    assert evicted_hash in arc_manager.b1
def test_arc_manager_full_scenario():
    """
    Comprehensive test covering multiple ARC operations in sequence.
    Similar to the full LRU test but adapted for ARC behavior.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # store [1, 2]
    arc_manager.prepare_store(to_hashes([1, 2]))
    arc_manager.complete_store(to_hashes([1, 2]))

    # store [3, 4, 5] -> evicts [1]
    prepare_store_output = arc_manager.prepare_store(to_hashes([3, 4, 5]))
    assert prepare_store_output is not None
    assert len(prepare_store_output.block_hashes_evicted) == 1
    arc_manager.complete_store(to_hashes([3, 4, 5]))

    # promote some blocks to T2
    arc_manager.touch(to_hashes([2, 3]))

    # T1 has {4, 5}, T2 has {2, 3}
    assert len(arc_manager.t1) == 2
    assert len(arc_manager.t2) == 2

    # store [6] -> should evict from T1 (4 is oldest in T1)
    prepare_store_output = arc_manager.prepare_store(to_hashes([6]))
    assert prepare_store_output is not None
    arc_manager.complete_store(to_hashes([6]))

    # verify blocks 2, 3 (in T2) are still present
    assert arc_manager.lookup(to_hashes([2])) == 1
    assert arc_manager.lookup(to_hashes([3])) == 1

    # verify events were emitted
    events = list(arc_manager.take_events())
    assert len(events) > 0  # should have store and eviction events

View File

@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import socket
import time
import msgspec
import msgspec.msgpack
import pytest
import zmq
from tqdm import tqdm
from vllm import LLM, SamplingParams, TokensPrompt
from vllm.config import KVEventsConfig, KVTransferConfig
from vllm.distributed.kv_events import BlockStored, KVEventBatch
from vllm.platforms import current_platform
from vllm.utils.system_utils import set_env_var
# CPU offloading block sizes to parametrize over.
CPU_BLOCK_SIZES = [48]

# Attention backends to exercise, adjusted per platform:
# FLASHINFER is added on CUDA; ROCm uses TRITON_ATTN only.
ATTN_BACKENDS = ["FLASH_ATTN"]
if current_platform.is_cuda():
    ATTN_BACKENDS.append("FLASHINFER")
elif current_platform.is_rocm():
    ATTN_BACKENDS = ["TRITON_ATTN"]
class MockSubscriber:
    """Helper class to receive and verify published events"""

    def __init__(
        self,
        endpoint: str,
        topic: str,
    ):
        self.ctx = zmq.Context.instance()
        self.topic_bytes = topic.encode("utf-8")

        # Set up subscriber socket filtered to the configured topic
        self.sub = self.ctx.socket(zmq.SUB)
        self.sub.setsockopt(zmq.SUBSCRIBE, self.topic_bytes)
        self.sub.connect(endpoint)

        self.decoder = msgspec.msgpack.Decoder(type=KVEventBatch)

    def get_new_cpu_stored_events(self) -> list[BlockStored]:
        """Drain pending event batches and return the CPU BlockStored events.

        Polls until the socket goes quiet: waits up to 1s for the first
        message, then 100ms between subsequent ones. Returns whatever CPU
        BlockStored events arrived (possibly an empty list).
        """
        cpu_stored_events: list[BlockStored] = []
        poller = zmq.Poller()
        poller.register(self.sub, zmq.POLLIN)
        timeout = 1000  # 1 second for the first message
        while True:
            events = dict(poller.poll(timeout))
            if events.get(self.sub) != zmq.POLLIN:
                # socket went quiet -> return what we have
                return cpu_stored_events
            # frames: topic, (middle frame, unused here), msgpack payload
            topic_bytes, _, payload = self.sub.recv_multipart()
            assert topic_bytes == self.topic_bytes
            event_batch = self.decoder.decode(payload)
            assert isinstance(event_batch, KVEventBatch)
            for event in event_batch.events:
                # keep only stores on the CPU offloading medium
                if isinstance(event, BlockStored) and event.medium == "CPU":
                    cpu_stored_events.append(event)
            # shorter timeout for follow-up messages
            timeout = 100

    def close(self):
        """Clean up resources"""
        self.sub.close()
def _latency_test(llm: LLM, subscriber: MockSubscriber):
    """Compare cold vs GPU-prefix-hit vs CPU-offload-hit generation latency.

    For each of 10 unique long prompts: generate cold, generate again (GPU
    prefix cache hit), reset the GPU prefix cache, then generate a third
    time (which should load KV from CPU). Asserts CPU-hit latency beats
    cold latency at least 80% of the time.
    """
    sampling_params = SamplingParams(max_tokens=1)

    num_times_cpu_better_than_cold = 0
    num_tests = 10
    total_cold_time = 0.0
    total_gpu_hit_time = 0.0
    total_cpu_hit_time = 0.0

    # long prompt so prefill dominates; first token varies per iteration
    # to make each prompt unique (avoid cross-iteration cache hits)
    prompt_token_ids = [0] * 10001
    for i in tqdm(range(num_tests), desc="Running tests"):
        prompt_token_ids[0] = i
        prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)]

        # run generation - this should trigger saving KV cache
        start_time = time.time()
        llm.generate(prompts, sampling_params, use_tqdm=False)
        cold_time = time.time() - start_time
        total_cold_time += cold_time

        # run generation again - should hit the GPU prefix cache
        start_time = time.time()
        llm.generate(prompts, sampling_params, use_tqdm=False)
        gpu_hit_time = time.time() - start_time
        total_gpu_hit_time += gpu_hit_time

        # reset prefix cache to avoid GPU hit.
        llm.reset_prefix_cache()
        # make sure the KV data actually reached the CPU backend
        assert subscriber.get_new_cpu_stored_events()

        # run generation again - this should trigger loading from CPU
        start_time = time.time()
        llm.generate(prompts, sampling_params, use_tqdm=False)
        cpu_hit_time = time.time() - start_time
        total_cpu_hit_time += cpu_hit_time

        if cpu_hit_time < cold_time:
            num_times_cpu_better_than_cold += 1

    print("Average times:")
    print(f"    Cold: {total_cold_time * 1000 / num_tests:.2f}ms")
    print(f"    GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms")
    print(f"    CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms")

    # allow some timing noise: require 80% wins rather than all
    assert num_times_cpu_better_than_cold >= 0.8 * num_tests
def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
    """Verify generation quality is preserved when KV is served from CPU.

    Pads a counting prompt until its token length is a multiple of the CPU
    block size (so the whole prefix is offloadable), then repeatedly
    generates and checks the model still continues the count correctly at
    least half the time.
    """
    sampling_params = SamplingParams(max_tokens=1)

    cpu_block_size = (
        llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config[
            "block_size"
        ]
    )

    # drain any previously published events
    subscriber.get_new_cpu_stored_events()

    # prepend prompt to be cpu block aligned
    prompt = "Let's count to 10. One, two, three, four,"
    while (
        len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size
        != 0
    ):
        prompt = ". " + prompt

    # the aligned prompt's KV must have been stored to CPU
    assert subscriber.get_new_cpu_stored_events()

    test_count = 100
    success_count = 0
    for i in range(test_count):
        if (
            llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text
            == " five"
        ):
            success_count += 1

    # sampling is stochastic; require a majority of correct continuations
    assert success_count >= 0.5 * test_count
@pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES)
@pytest.mark.parametrize("attn_backend", ATTN_BACKENDS)
def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None:
    """
    Tests OffloadingConnector with CPUOffloadingSpec.

    Spins up an LLM with CPU KV offloading and ZMQ KV-event publishing,
    then runs the latency and accuracy sub-tests against it.
    """
    # configure OffloadingConnector (spec_name=CPUOffloadingSpec by default)
    kv_transfer_config = KVTransferConfig(
        kv_connector="OffloadingConnector",
        kv_role="kv_both",
        kv_connector_extra_config={
            "num_cpu_blocks": 1000,
            "block_size": cpu_block_size,
        },
    )

    # grab a free TCP port for the event publisher.
    # NOTE(review): the socket is closed before the publisher binds, so the
    # port could in principle be re-taken in between (benign flake risk).
    port: int
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("0.0.0.0", 0))
        port = s.getsockname()[1]

    events_endpoint = f"tcp://*:{port}"
    kv_events_config = KVEventsConfig(
        enable_kv_cache_events=True,
        publisher="zmq",
        endpoint=events_endpoint,
        topic="test",
    )

    with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend):
        llm = LLM(
            model="meta-llama/Llama-3.2-1B-Instruct",
            gpu_memory_utilization=0.5,
            kv_events_config=kv_events_config,
            kv_transfer_config=kv_transfer_config,
        )

    # subscriber connects to the concrete loopback address
    events_endpoint = events_endpoint.replace("*", "127.0.0.1")
    subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic)

    try:
        _latency_test(llm, subscriber)
        _accuracy_test(llm, subscriber)
    finally:
        subscriber.close()
        del llm

View File

@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.v1.kv_offload.abstract import LoadStoreSpec
from vllm.v1.kv_offload.worker.worker import (
OffloadingHandler,
OffloadingWorker,
TransferResult,
TransferSpec,
)
class LoadStoreSpec1(LoadStoreSpec):
    """Fake spec for medium "1" with configurable transfer outcomes.

    The flags control how the test handlers treat a transfer involving this
    spec: raise on submit, fail the submit, or fail/succeed asynchronously.
    """

    def __init__(
        self,
        submit_success: bool = True,
        async_success: bool = True,
        exception: bool = False,
    ):
        self.exception = exception
        self.submit_success = submit_success
        self.async_success = async_success
        # set by the test to mark the async transfer as done
        self.finished = False

    @staticmethod
    def medium() -> str:
        return "1"

    def __repr__(self):
        return "{}: {}".format(self.medium(), id(self))
class LoadStoreSpec2(LoadStoreSpec):
    """Fake spec for medium "2"; carries no per-transfer state."""

    @staticmethod
    def medium() -> str:
        return "2"

    def __repr__(self):
        return "{}: {}".format(self.medium(), id(self))
class OffloadingHandler1To2(OffloadingHandler):
    """Test handler for 1 -> 2 transfers.

    Completion is driven by the *source* spec: its flags decide whether
    submission raises, fails, or succeeds, and its ``finished`` flag marks
    the async transfer as done.
    """

    def __init__(self):
        # job_id -> source spec of each in-flight transfer
        self.transfers: dict[int, LoadStoreSpec1] = {}

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        src, dst = spec
        assert isinstance(src, LoadStoreSpec1)
        assert isinstance(dst, LoadStoreSpec2)
        if src.exception:
            raise Exception("An expected exception. Don't worry!")
        if not src.submit_success:
            return False
        self.transfers[job_id] = src
        return True

    def get_finished(self) -> list[TransferResult]:
        # collect finished jobs first, then drop them from the in-flight map
        done = [
            (job_id, spec.async_success)
            for job_id, spec in self.transfers.items()
            if spec.finished
        ]
        for job_id, _ in done:
            del self.transfers[job_id]
        return done
class OffloadingHandler2To1(OffloadingHandler):
    """Test handler for 2 -> 1 transfers.

    Completion is driven by the *destination* spec (the LoadStoreSpec1 side):
    its ``finished`` and ``async_success`` flags determine the result.
    """

    def __init__(self):
        # job_id -> destination spec of each in-flight transfer
        self.transfers: dict[int, LoadStoreSpec1] = {}

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        src, dst = spec
        assert isinstance(src, LoadStoreSpec2)
        assert isinstance(dst, LoadStoreSpec1)
        self.transfers[job_id] = dst
        return True

    def get_finished(self) -> list[TransferResult]:
        # collect finished jobs first, then drop them from the in-flight map
        done = [
            (job_id, spec.async_success)
            for job_id, spec in self.transfers.items()
            if spec.finished
        ]
        for job_id, _ in done:
            del self.transfers[job_id]
        return done
def test_offloading_worker():
    """
    Tests OffloadingWorker with 2 handlers.
    One handler performs 1->2 transfers, and the other handles 2->1.

    Exercises submit-time exceptions, submit failures, async failures and
    successes, and checks that get_finished reports completed transfers from
    both handlers while in-flight ones keep being tracked.
    """
    worker = OffloadingWorker()
    handler1to2 = OffloadingHandler1To2()
    handler2to1 = OffloadingHandler2To1()
    # route transfers by (src medium, dst medium)
    worker.register_handler(LoadStoreSpec1, LoadStoreSpec2, handler1to2)
    worker.register_handler(LoadStoreSpec2, LoadStoreSpec1, handler2to1)

    # 1st transfer 1->2 (exception raised by the handler -> reported as failed)
    src1 = LoadStoreSpec1(exception=True)
    dst1 = LoadStoreSpec2()
    assert not worker.transfer_async(1, (src1, dst1))

    # 2nd transfer 1->2 (failure to submit)
    src2 = LoadStoreSpec1(submit_success=False)
    dst2 = LoadStoreSpec2()
    assert not worker.transfer_async(2, (src2, dst2))

    # 3rd transfer 1->2 (submits fine, will fail asynchronously)
    src3 = LoadStoreSpec1(async_success=False)
    dst3 = LoadStoreSpec2()
    assert worker.transfer_async(3, (src3, dst3))

    # 4th transfer 1->2 (success)
    src4 = LoadStoreSpec1()
    dst4 = LoadStoreSpec2()
    worker.transfer_async(4, (src4, dst4))
    # only the successfully submitted jobs are tracked
    assert set(handler1to2.transfers.keys()) == {3, 4}

    # 5th transfer 2->1
    src5 = LoadStoreSpec2()
    dst5 = LoadStoreSpec1()
    worker.transfer_async(5, (src5, dst5))
    assert set(handler2to1.transfers.keys()) == {5}

    # no transfer completed yet
    assert worker.get_finished() == []

    # complete 3rd, 4th
    src3.finished = True
    src4.finished = True

    # 6th transfer 1->2
    src6 = LoadStoreSpec1()
    dst6 = LoadStoreSpec2()
    worker.transfer_async(6, (src6, dst6))

    # 7th transfer 2->1
    src7 = LoadStoreSpec2()
    dst7 = LoadStoreSpec1()
    worker.transfer_async(7, (src7, dst7))

    # 6th and 7th transfers started
    assert 6 in handler1to2.transfers
    assert 7 in handler2to1.transfers

    # verify result of 3rd (async failure) and 4th (success) transfers
    assert sorted(worker.get_finished()) == [(3, False), (4, True)]

    # complete 6th and 7th transfers (7 completes via its destination spec)
    src6.finished = True
    dst7.finished = True
    assert sorted(worker.get_finished()) == [(6, True), (7, True)]