Sync from v0.13
This commit is contained in:
191
tests/v1/kv_offload/test_cpu_gpu.py
Normal file
191
tests/v1/kv_offload/test_cpu_gpu.py
Normal file
@@ -0,0 +1,191 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
|
||||
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
|
||||
|
||||
# Attention backends to cycle through when building per-layer KV caches.
# FlashAttention is available everywhere; the rest are CUDA-only.
BACKENDS_TO_TEST = [FlashAttentionBackend]

if not current_platform.is_rocm():
    # these backends are not available on ROCm, so import them lazily
    from vllm.v1.attention.backends.flashinfer import FlashInferBackend

    BACKENDS_TO_TEST.append(FlashInferBackend)

    from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend

    BACKENDS_TO_TEST.append(FlashAttnMLABackend)

# Parametrization values for test_transfer below.
NUM_GPU_BLOCKS = [64]
NUM_CPU_BLOCKS = [256]
GPU_BLOCK_SIZES = [16]
# 3 exercises the case where one CPU block spans multiple GPU blocks
GPU_BLOCKS_PER_CPU_BLOCK = [1, 3]
HEAD_SIZES = [64]
NUM_HEADS = [8]
NUM_LAYERS = [4]
DTYPES = [torch.bfloat16]
SEEDS = [0]
CUDA_DEVICES = ["cuda:0"]
# number of (src, dst) block mappings per transfer
NUM_MAPPINGS = [3]
||||
|
||||
|
||||
@pytest.mark.parametrize("gpu_to_cpu", [True, False])
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("gpu_block_size", GPU_BLOCK_SIZES)
@pytest.mark.parametrize("gpu_blocks_per_cpu_block", GPU_BLOCKS_PER_CPU_BLOCK)
@pytest.mark.parametrize("num_gpu_blocks", NUM_GPU_BLOCKS)
@pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_transfer(
    gpu_to_cpu: bool,
    num_mappings: int,
    head_size: int,
    num_heads: int,
    gpu_block_size: int,
    gpu_blocks_per_cpu_block: int,
    num_gpu_blocks: int,
    num_cpu_blocks: int,
    num_layers: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    """
    Tests CpuGpuOffloadingHandlers block transfers in both directions.

    Builds per-layer GPU KV caches (cycling over the available attention
    backends), transfers a random set of block mappings, then verifies that
    source tensors are untouched and destination tensors received exactly
    the mapped blocks (all other destination blocks unchanged).
    """
    current_platform.seed_everything(seed)

    # create per-layer GPU KV caches based on available attn_backends
    attn_backends_list = BACKENDS_TO_TEST

    gpu_caches = {}
    attn_backends = {}
    for i in range(num_layers):
        layer_name = f"layer {i}"

        # cycle through backends so each one is covered
        attn_backend = attn_backends_list[i % len(attn_backends_list)]
        attn_backends[layer_name] = attn_backend

        gpu_cache_shape = attn_backend.get_kv_cache_shape(
            num_gpu_blocks, gpu_block_size, num_heads, head_size
        )
        gpu_caches[layer_name] = torch.rand(gpu_cache_shape, dtype=dtype, device=device)

    # create handler
    cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size
    handlers = CpuGpuOffloadingHandlers(
        attn_backends=attn_backends,
        gpu_block_size=gpu_block_size,
        cpu_block_size=cpu_block_size,
        num_cpu_blocks=num_cpu_blocks,
        gpu_caches=gpu_caches,
    )

    # select block mappings (distinct blocks on each side)
    gpu_blocks = random.sample(
        range(num_gpu_blocks), num_mappings * gpu_blocks_per_cpu_block
    )
    cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings)

    # convert cpu blocks to gpu block size
    cpu_blocks_in_gpu_block_size = []
    for cpu_block in cpu_blocks:
        base_block_id = cpu_block * gpu_blocks_per_cpu_block
        for i in range(gpu_blocks_per_cpu_block):
            cpu_blocks_in_gpu_block_size.append(i + base_block_id)

    # maybe skip a GPU block to test reading from the middle of a CPU block
    if not gpu_to_cpu:
        gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1 :]
        cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[
            gpu_blocks_per_cpu_block - 1 :
        ]

    # set transfer direction
    if gpu_to_cpu:
        handler = handlers.gpu_to_cpu_handler
        src_spec_class = GPULoadStoreSpec
        dst_spec_class = CPULoadStoreSpec
        src_blocks = gpu_blocks
        dst_blocks = cpu_blocks
        src_blocks_in_gpu_block_size = gpu_blocks
        dst_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_size_in_gpu_blocks = num_cpu_blocks * gpu_blocks_per_cpu_block
    else:
        handler = handlers.cpu_to_gpu_handler
        src_spec_class = CPULoadStoreSpec
        dst_spec_class = GPULoadStoreSpec
        src_blocks = cpu_blocks
        dst_blocks = gpu_blocks
        src_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_blocks_in_gpu_block_size = gpu_blocks
        dst_size_in_gpu_blocks = num_gpu_blocks

    # build dst -> src mapping (in GPU-block granularity)
    dst_to_src = {}
    for src_block, dst_block in zip(
        src_blocks_in_gpu_block_size, dst_blocks_in_gpu_block_size
    ):
        dst_to_src[dst_block] = src_block

    # build transfer specs
    src_spec = src_spec_class(src_blocks)
    dst_spec = dst_spec_class(dst_blocks)

    # clone src and dst tensors before transfer
    orig_src_caches = [x.clone() for x in handler.src_tensors]
    orig_dst_caches = [x.clone() for x in handler.dst_tensors]

    # call transfer function
    assert handler.transfer_async(1, (src_spec, dst_spec))
    # fix: the comprehension already builds a set; no need to wrap in set()
    assert {x[0] for x in handler._transfers} == {1}

    # wait for transfer to complete (up to 10 seconds)
    end_time = time.time() + 10
    while time.time() < end_time:
        finished = handler.get_finished()
        if finished:
            assert finished == [(1, True)]
            break
        time.sleep(0.1)
    else:
        # fix: previously a transfer that never completed passed silently
        pytest.fail("transfer did not complete within 10 seconds")

    # verify src tensors did not change
    for orig_tensor, tensor in zip(orig_src_caches, handler.src_tensors):
        assert torch.equal(orig_tensor, tensor)

    # verify dst tensors: mapped blocks match src, unmapped blocks unchanged
    for dst_block in range(dst_size_in_gpu_blocks):
        src_block_candidate = dst_to_src.get(dst_block)
        for src_cache, dst_cache, orig_dst_cache, kv_dim in zip(
            handler.src_tensors,
            handler.dst_tensors,
            orig_dst_caches,
            handler.kv_dim_before_num_blocks,
        ):
            if kv_dim:
                # iterate over key, value
                for i in range(2):
                    if src_block_candidate is not None:
                        expected_value = src_cache[i][src_block_candidate]
                    else:
                        expected_value = orig_dst_cache[i][dst_block]
                    torch.testing.assert_close(
                        dst_cache[i][dst_block].cpu(), expected_value.cpu()
                    )
            else:
                if src_block_candidate is not None:
                    expected_value = src_cache[src_block_candidate]
                else:
                    expected_value = orig_dst_cache[dst_block]
                torch.testing.assert_close(
                    dst_cache[dst_block].cpu(), expected_value.cpu()
                )
|
||||
497
tests/v1/kv_offload/test_cpu_manager.py
Normal file
497
tests/v1/kv_offload/test_cpu_manager.py
Normal file
@@ -0,0 +1,497 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Iterable
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
|
||||
from vllm.v1.core.kv_cache_utils import BlockHash
|
||||
from vllm.v1.kv_offload.abstract import (
|
||||
LoadStoreSpec,
|
||||
OffloadingEvent,
|
||||
PrepareStoreOutput,
|
||||
)
|
||||
from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager
|
||||
from vllm.v1.kv_offload.backends.cpu import CPUBackend
|
||||
from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
|
||||
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
|
||||
|
||||
|
||||
@dataclass
class ExpectedPrepareStoreOutput:
    """Expected result of OffloadingManager.prepare_store, in plain ints.

    Integer hashes are converted to BlockHash via to_hashes() when comparing
    against the real PrepareStoreOutput (see verify_store_output).
    """

    # int ids of the block hashes that still need to be written
    block_hashes_to_store: list[int]
    # backend block ids allocated for the store, in order
    store_block_ids: list[int]
    # int ids of the block hashes evicted to make room
    block_hashes_evicted: list[int]
||||
|
||||
|
||||
def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
    """Convert integer ids to BlockHash values (UTF-8 bytes of the decimal repr)."""
    hashes: list[BlockHash] = []
    for int_hash in int_hashes:
        hashes.append(BlockHash(str(int_hash).encode()))
    return hashes
|
||||
|
||||
|
||||
def verify_store_output(
    prepare_store_output: PrepareStoreOutput | None,
    expected_prepare_store_output: ExpectedPrepareStoreOutput,
):
    """Assert that a real PrepareStoreOutput matches the expected int-encoded one."""
    assert prepare_store_output is not None

    expected = expected_prepare_store_output
    assert prepare_store_output.block_hashes_to_store == to_hashes(
        expected.block_hashes_to_store
    )
    assert prepare_store_output.block_hashes_evicted == to_hashes(
        expected.block_hashes_evicted
    )

    # the store spec must be a CPU spec with the expected block ids
    spec = prepare_store_output.store_spec
    assert isinstance(spec, CPULoadStoreSpec)
    expected_block_ids = np.array(expected.store_block_ids, dtype=np.int64)
    assert np.array_equal(expected_block_ids, spec.block_ids)
|
||||
|
||||
|
||||
def verify_load_output(
    prepare_load_output: LoadStoreSpec, expected_prepare_load_output: list[int]
):
    """Assert that a load spec is a CPU spec carrying the expected block ids."""
    assert isinstance(prepare_load_output, CPULoadStoreSpec)
    expected_block_ids = np.array(expected_prepare_load_output, dtype=np.int64)
    assert np.array_equal(expected_block_ids, prepare_load_output.block_ids)
|
||||
|
||||
|
||||
def verify_events(
    events: Iterable[OffloadingEvent],
    block_size: int,
    expected_stores: tuple[set[int], ...] = (),
    expected_evictions: tuple[set[int], ...] = (),
):
    """Assert that *events* yields exactly the expected store/eviction batches.

    Order of batches matters; order of hashes within a batch does not
    (batches are compared as sets).
    """
    observed_stores: list[set[BlockHash]] = []
    observed_evictions: list[set[BlockHash]] = []
    for event in events:
        # every event must come from the CPU medium with the right block size
        assert event.medium == CPULoadStoreSpec.medium()
        assert event.block_size == block_size
        target = observed_evictions if event.removed else observed_stores
        target.append(set(event.block_hashes))

    def as_hash_sets(int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]:
        return tuple(set(to_hashes(list(int_set))) for int_set in int_sets)

    assert tuple(observed_evictions) == as_hash_sets(expected_evictions)
    assert tuple(observed_stores) == as_hash_sets(expected_stores)
|
||||
|
||||
|
||||
def test_cpu_manager():
    """
    Tests LRUOffloadingManager with a CPUBackend.

    Walks through the full store/load/evict lifecycle: partial stores,
    lookups before and after completion, eviction in LRU order, refusal to
    evict blocks with in-flight loads, touch() reordering, and failed stores.
    """
    # initialize a CPU backend with a capacity of 4 blocks
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    cpu_manager = LRUOffloadingManager(cpu_backend, enable_events=True)

    # prepare store [1, 2]
    prepare_store_output = cpu_manager.prepare_store(to_hashes([1, 2]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[1, 2],
            store_block_ids=[0, 1],
            block_hashes_evicted=[],
        ),
    )

    # lookup [1, 2] -> not ready (store not yet completed)
    assert cpu_manager.lookup(to_hashes([1, 2])) == 0

    # no events so far
    assert list(cpu_manager.take_events()) == []

    # complete store [1, 2]
    cpu_manager.complete_store(to_hashes([1, 2]))
    verify_events(
        cpu_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
    )

    # lookup [1, 2] -> returns the length of the stored prefix
    assert cpu_manager.lookup(to_hashes([1])) == 1
    assert cpu_manager.lookup(to_hashes([1, 2])) == 2
    assert cpu_manager.lookup(to_hashes([1, 2, 3])) == 2

    # prepare store [2, 3, 4, 5] -> evicts [1] ([2] is already stored)
    prepare_store_output = cpu_manager.prepare_store(to_hashes([2, 3, 4, 5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[3, 4, 5],
            store_block_ids=[2, 3, 0],
            block_hashes_evicted=[1],
        ),
    )

    # verify eviction event
    verify_events(
        cpu_manager.take_events(), block_size=block_size, expected_evictions=({1},)
    )

    # prepare store with no space
    assert cpu_manager.prepare_store(to_hashes([1, 6])) is None

    # complete store [2, 3, 4, 5]
    cpu_manager.complete_store(to_hashes([2, 3, 4, 5]))

    # prepare load [2, 3]
    prepare_load_output = cpu_manager.prepare_load(to_hashes([2, 3]))
    verify_load_output(prepare_load_output, [1, 2])

    # prepare store with no space ([2, 3] is being loaded)
    assert cpu_manager.prepare_store(to_hashes([6, 7, 8])) is None

    # complete load [2, 3]
    cpu_manager.complete_load(to_hashes([2, 3]))

    # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest)
    prepare_store_output = cpu_manager.prepare_store(to_hashes([6, 7, 8]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[6, 7, 8],
            store_block_ids=[3, 2, 1],
            block_hashes_evicted=[2, 3, 4],
        ),
    )

    # complete store [6, 7, 8]
    cpu_manager.complete_store(to_hashes([6, 7, 8]))

    # touch [5, 6, 7] (move to end of LRU order)
    cpu_manager.touch(to_hashes([5, 6, 7]))

    # prepare store [9] -> evicts [8] (oldest following previous touch)
    prepare_store_output = cpu_manager.prepare_store(to_hashes([9]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[9],
            store_block_ids=[1],
            block_hashes_evicted=[8],
        ),
    )

    # complete store [7, 9] with failure ([7] was already stored successfully)
    cpu_manager.complete_store(to_hashes([7, 9]), success=False)

    # assert [7] is still stored, but [9] is not
    assert cpu_manager.lookup(to_hashes([7])) == 1
    assert cpu_manager.lookup(to_hashes([9])) == 0

    verify_events(
        cpu_manager.take_events(),
        block_size=block_size,
        expected_stores=({3, 4, 5}, {6, 7, 8}),
        expected_evictions=({2, 3, 4}, {8}),
    )
|
||||
|
||||
|
||||
def test_arc_manager_basic():
    """
    Tests ARCOffloadingManager basic operations with a CPUBackend.
    Verifies that ARC handles store, load, and lookup operations correctly.
    """
    # initialize a CPU backend with a capacity of 4 blocks
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # prepare store [1, 2]
    prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[1, 2],
            store_block_ids=[0, 1],
            block_hashes_evicted=[],
        ),
    )

    # lookup [1, 2] -> not ready (store not yet completed)
    assert arc_manager.lookup(to_hashes([1, 2])) == 0

    # no events so far
    assert list(arc_manager.take_events()) == []

    # complete store [1, 2] -> store event is published
    arc_manager.complete_store(to_hashes([1, 2]))
    verify_events(
        arc_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
    )

    # lookup [1, 2] -> returns length of the stored prefix
    assert arc_manager.lookup(to_hashes([1])) == 1
    assert arc_manager.lookup(to_hashes([1, 2])) == 2
    assert arc_manager.lookup(to_hashes([1, 2, 3])) == 2

    # blocks should be in T1 (recent); nothing promoted to T2 yet
    assert len(arc_manager.t1) == 2
    assert len(arc_manager.t2) == 0
|
||||
|
||||
|
||||
def test_arc_manager_t1_to_t2_promotion():
    """
    Tests that accessing a block in T1 promotes it to T2 (frequent).
    This is a key feature of ARC's adaptive behavior.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # store and complete block 1
    block_hashes = to_hashes([1])
    block_1 = block_hashes[0]
    arc_manager.prepare_store(block_hashes)
    arc_manager.complete_store(block_hashes)

    # a freshly stored block lands in T1 (recent), not T2
    assert block_1 in arc_manager.t1
    assert block_1 not in arc_manager.t2

    # a second access (touch) promotes it to the frequent list
    arc_manager.touch(block_hashes)

    assert block_1 not in arc_manager.t1
    assert block_1 in arc_manager.t2
|
||||
|
||||
|
||||
def test_arc_manager_eviction_with_load():
    """
    Tests ARC eviction behavior similar to LRU test.
    Verifies that blocks being loaded (ref_cnt > 0) cannot be evicted.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # prepare and complete store [1, 2, 3, 4] (fills the cache exactly)
    prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[1, 2, 3, 4],
            store_block_ids=[0, 1, 2, 3],
            block_hashes_evicted=[],
        ),
    )
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # prepare load [2, 3] (increases ref_cnt)
    prepare_load_output = arc_manager.prepare_load(to_hashes([2, 3]))
    verify_load_output(prepare_load_output, [1, 2])

    # prepare store [5, 6, 7] with [2, 3] being loaded
    # should fail because [2, 3] have ref_cnt > 0
    assert arc_manager.prepare_store(to_hashes([5, 6, 7])) is None

    # complete load [2, 3] (ref_cnt drops back to 0)
    arc_manager.complete_load(to_hashes([2, 3]))

    # now prepare store [5, 6, 7] should succeed
    # ARC will evict blocks one at a time from T1 as needed
    prepare_store_output = arc_manager.prepare_store(to_hashes([5, 6, 7]))
    assert prepare_store_output is not None
    # Should successfully evict enough blocks to make room (at least 1)
    assert len(prepare_store_output.block_hashes_evicted) >= 1
|
||||
|
||||
|
||||
def test_arc_manager_adaptive_target():
    """
    Tests ARC's adaptive target adjustment via ghost lists.
    When a block in B1 (ghost list) is accessed, target_t1_size increases.
    When a block in B2 is accessed, target_t1_size decreases.
    """
    # tiny 2-block cache so a single extra store triggers an eviction
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=2)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # store blocks 1, 2 (fills cache)
    arc_manager.prepare_store(to_hashes([1, 2]))
    arc_manager.complete_store(to_hashes([1, 2]))

    initial_target = arc_manager.target_t1_size

    # store block 3, evicting block 1 (moves to B1 ghost list)
    arc_manager.prepare_store(to_hashes([3]))
    arc_manager.complete_store(to_hashes([3]))

    # block 1 should be in B1 (ghost list)
    assert to_hashes([1])[0] in arc_manager.b1

    # touch block 1 (cache miss, but in B1)
    # this should increase target_t1_size (favor recency)
    arc_manager.touch(to_hashes([1]))

    # target should have increased
    assert arc_manager.target_t1_size > initial_target
|
||||
|
||||
|
||||
def test_arc_manager_t1_t2_eviction_policy():
    """
    Tests that ARC evicts from T1 or T2 based on target_t1_size.
    If |T1| >= target_t1_size, evict from T1, otherwise from T2.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # store blocks 1, 2, 3, 4 (fills the cache)
    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # promote blocks 3, 4 to T2 by touching them
    arc_manager.touch(to_hashes([3, 4]))

    # now: T1 = {1, 2}, T2 = {3, 4}
    assert len(arc_manager.t1) == 2
    assert len(arc_manager.t2) == 2

    # set target_t1_size to prefer evicting from T1
    # (when |T1| >= target, evict from T1)
    arc_manager.target_t1_size = 1

    # store block 5, should evict from T1 (block 1, LRU in T1)
    output = arc_manager.prepare_store(to_hashes([5]))
    assert output is not None
    assert to_hashes([1]) == output.block_hashes_evicted

    arc_manager.complete_store(to_hashes([5]))

    # block 1 should be in B1 (ghost list)
    assert to_hashes([1])[0] in arc_manager.b1
    # block 5 should be in T1 (first access)
    assert to_hashes([5])[0] in arc_manager.t1
|
||||
|
||||
|
||||
def test_arc_manager_ghost_list_bounds():
    """
    Tests that ghost lists (B1, B2) don't grow unbounded.
    They should be capped at cache_capacity.
    """
    block_size = 256
    backend = CPUBackend(block_size=block_size, num_blocks=2)
    manager = ARCOffloadingManager(backend, enable_events=False)

    # fill the 2-block cache with blocks 1, 2
    manager.prepare_store(to_hashes([1, 2]))
    manager.complete_store(to_hashes([1, 2]))

    # churn many single-block stores so evictions keep feeding the ghost lists
    for block_id in range(3, 20):
        hashes = to_hashes([block_id])
        manager.prepare_store(hashes)
        manager.complete_store(hashes)

    # neither ghost list may exceed the cache capacity
    capacity = manager.cache_capacity
    assert len(manager.b1) <= capacity
    assert len(manager.b2) <= capacity
|
||||
|
||||
|
||||
def test_arc_manager_touch_ordering():
    """
    Tests that touch() correctly updates access patterns.
    Similar to LRU test but verifies T1/T2 ordering.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # store blocks 1, 2, 3, 4 (fills the cache)
    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # promote 3, 4 to T2
    arc_manager.touch(to_hashes([3, 4]))

    # T1 = {1, 2}, T2 = {3, 4}
    # touch [1, 3, 4] - should promote 1 to T2, and move 3,4 to end of T2
    arc_manager.touch(to_hashes([1, 3, 4]))

    # T1 = {2}, T2 = {1, 3, 4} (in that order, with 4 most recent)
    assert len(arc_manager.t1) == 1
    assert len(arc_manager.t2) == 3

    # store block 5, should evict from T1 (block 2, only one in T1)
    prepare_store_output = arc_manager.prepare_store(to_hashes([5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[5],
            store_block_ids=[1],  # reuses block 2's storage
            block_hashes_evicted=[2],
        ),
    )
|
||||
|
||||
|
||||
def test_arc_manager_failed_store():
    """
    Tests that failed store operations clean up correctly.
    Similar to LRU test but for ARC.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # store blocks 1, 2, 3, 4 (fills the cache)
    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # prepare store block 5 (will evict block 1)
    prepare_store_output = arc_manager.prepare_store(to_hashes([5]))
    assert prepare_store_output is not None
    assert len(prepare_store_output.block_hashes_evicted) == 1

    # complete store with failure
    arc_manager.complete_store(to_hashes([5]), success=False)

    # block 5 should not be in cache
    assert arc_manager.lookup(to_hashes([5])) == 0
    # block 5 should not be in T1 or T2
    assert to_hashes([5])[0] not in arc_manager.t1
    assert to_hashes([5])[0] not in arc_manager.t2

    # evicted block should still be gone (in B1 ghost list);
    # a failed store does not resurrect the block it evicted
    evicted_hash = prepare_store_output.block_hashes_evicted[0]
    assert evicted_hash in arc_manager.b1
|
||||
|
||||
|
||||
def test_arc_manager_full_scenario():
    """
    Comprehensive test covering multiple ARC operations in sequence.
    Similar to the full LRU test but adapted for ARC behavior.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # store [1, 2]
    arc_manager.prepare_store(to_hashes([1, 2]))
    arc_manager.complete_store(to_hashes([1, 2]))

    # store [3, 4, 5] -> evicts [1]
    prepare_store_output = arc_manager.prepare_store(to_hashes([3, 4, 5]))
    assert prepare_store_output is not None
    assert len(prepare_store_output.block_hashes_evicted) == 1
    arc_manager.complete_store(to_hashes([3, 4, 5]))

    # promote some blocks to T2
    arc_manager.touch(to_hashes([2, 3]))

    # T1 has {4, 5}, T2 has {2, 3}
    assert len(arc_manager.t1) == 2
    assert len(arc_manager.t2) == 2

    # store [6] -> should evict from T1 (4 is oldest in T1)
    prepare_store_output = arc_manager.prepare_store(to_hashes([6]))
    assert prepare_store_output is not None
    arc_manager.complete_store(to_hashes([6]))

    # verify blocks 2, 3 (in T2) are still present
    assert arc_manager.lookup(to_hashes([2])) == 1
    assert arc_manager.lookup(to_hashes([3])) == 1

    # verify events
    events = list(arc_manager.take_events())
    assert len(events) > 0  # should have store and eviction events
|
||||
199
tests/v1/kv_offload/test_cpu_offloading.py
Normal file
199
tests/v1/kv_offload/test_cpu_offloading.py
Normal file
@@ -0,0 +1,199 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import socket
|
||||
import time
|
||||
|
||||
import msgspec
|
||||
import msgspec.msgpack
|
||||
import pytest
|
||||
import zmq
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm import LLM, SamplingParams, TokensPrompt
|
||||
from vllm.config import KVEventsConfig, KVTransferConfig
|
||||
from vllm.distributed.kv_events import BlockStored, KVEventBatch
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.system_utils import set_env_var
|
||||
|
||||
CPU_BLOCK_SIZES = [48]
|
||||
ATTN_BACKENDS = ["FLASH_ATTN"]
|
||||
|
||||
if current_platform.is_cuda():
|
||||
ATTN_BACKENDS.append("FLASHINFER")
|
||||
elif current_platform.is_rocm():
|
||||
ATTN_BACKENDS = ["TRITON_ATTN"]
|
||||
|
||||
|
||||
class MockSubscriber:
    """Helper class to receive and verify published events"""

    def __init__(
        self,
        endpoint: str,
        topic: str,
    ):
        # shared process-wide zmq context; not terminated by close()
        self.ctx = zmq.Context.instance()
        self.topic_bytes = topic.encode("utf-8")

        # Set up subscriber socket
        self.sub = self.ctx.socket(zmq.SUB)
        self.sub.setsockopt(zmq.SUBSCRIBE, self.topic_bytes)
        self.sub.connect(endpoint)

        self.decoder = msgspec.msgpack.Decoder(type=KVEventBatch)

    def get_new_cpu_stored_events(self) -> list[BlockStored]:
        """Drain all currently pending events; return BlockStored events with medium "CPU".

        Waits up to 1 second for the first message, then 100ms between
        subsequent messages, and returns once the socket goes quiet.
        """
        cpu_stored_events: list[BlockStored] = []

        poller = zmq.Poller()
        poller.register(self.sub, zmq.POLLIN)

        timeout = 1000  # 1 second
        while True:
            events = dict(poller.poll(timeout))

            # socket went quiet -> no more pending events
            if events.get(self.sub) != zmq.POLLIN:
                return cpu_stored_events

            # NOTE(review): assumes a 3-frame message (topic, seq, payload)
            # as produced by the zmq event publisher — confirm frame layout
            topic_bytes, _, payload = self.sub.recv_multipart()

            assert topic_bytes == self.topic_bytes

            event_batch = self.decoder.decode(payload)
            assert isinstance(event_batch, KVEventBatch)
            for event in event_batch.events:
                if isinstance(event, BlockStored) and event.medium == "CPU":
                    cpu_stored_events.append(event)
            # shorter timeout once the first message has arrived
            timeout = 100

    def close(self):
        """Clean up resources"""
        self.sub.close()
|
||||
|
||||
|
||||
def _latency_test(llm: LLM, subscriber: MockSubscriber):
    """Measure cold / GPU-cache-hit / CPU-cache-hit generation latency.

    Asserts that loading KV from CPU beats a cold prefill in at least 80%
    of the iterations (some slack allowed for timing noise).
    """
    sampling_params = SamplingParams(max_tokens=1)

    num_times_cpu_better_than_cold = 0
    num_tests = 10
    total_cold_time = 0.0
    total_gpu_hit_time = 0.0
    total_cpu_hit_time = 0.0
    # long prompt (10001 tokens) so prefill dominates the measured time
    prompt_token_ids = [0] * 10001
    for i in tqdm(range(num_tests), desc="Running tests"):
        # vary the first token so each iteration starts from a cold prefix
        prompt_token_ids[0] = i
        prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)]

        # run generation - this should trigger saving KV cache
        start_time = time.time()
        llm.generate(prompts, sampling_params, use_tqdm=False)
        cold_time = time.time() - start_time
        total_cold_time += cold_time

        # run generation again - should hit the GPU prefix cache
        start_time = time.time()
        llm.generate(prompts, sampling_params, use_tqdm=False)
        gpu_hit_time = time.time() - start_time
        total_gpu_hit_time += gpu_hit_time

        # reset prefix cache to avoid GPU hit.
        llm.reset_prefix_cache()

        # ensure the KV blocks actually reached the CPU before timing the reload
        assert subscriber.get_new_cpu_stored_events()

        # run generation again - this should trigger loading from CPU
        start_time = time.time()
        llm.generate(prompts, sampling_params, use_tqdm=False)
        cpu_hit_time = time.time() - start_time
        total_cpu_hit_time += cpu_hit_time

        if cpu_hit_time < cold_time:
            num_times_cpu_better_than_cold += 1

    print("Average times:")
    print(f" Cold: {total_cold_time * 1000 / num_tests:.2f}ms")
    print(f" GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms")
    print(f" CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms")

    # CPU hits should usually beat cold runs (allow 20% noise)
    assert num_times_cpu_better_than_cold >= 0.8 * num_tests
|
||||
|
||||
|
||||
def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
    """Verify that generation served from CPU-offloaded KV stays accurate.

    Aligns the prompt to the CPU block size so the whole prefix is
    offloadable, then checks the model still completes the counting prompt
    correctly in at least half of 100 runs.
    """
    sampling_params = SamplingParams(max_tokens=1)
    cpu_block_size = (
        llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config[
            "block_size"
        ]
    )

    # drain any events left over from earlier tests
    subscriber.get_new_cpu_stored_events()

    # prepend prompt to be cpu block aligned
    prompt = "Let's count to 10. One, two, three, four,"
    while (
        len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size
        != 0
    ):
        prompt = ". " + prompt

    # the aligned prompt must have been offloaded to CPU
    assert subscriber.get_new_cpu_stored_events()

    test_count = 100
    success_count = 0
    for i in range(test_count):
        if (
            llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text
            == " five"
        ):
            success_count += 1

    # sampling is non-greedy here, so only require a majority of correct answers
    assert success_count >= 0.5 * test_count
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES)
@pytest.mark.parametrize("attn_backend", ATTN_BACKENDS)
def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None:
    """
    Tests OffloadingConnector with CPUOffloadingSpec.

    Spins up an LLM with CPU KV offloading and zmq KV-cache events enabled,
    then runs a latency test and an accuracy test against it.
    """

    # configure OffloadingConnector (spec_name=CPUOffloadingSpec by default)
    kv_transfer_config = KVTransferConfig(
        kv_connector="OffloadingConnector",
        kv_role="kv_both",
        kv_connector_extra_config={
            "num_cpu_blocks": 1000,
            "block_size": cpu_block_size,
        },
    )

    # pick a free TCP port for the events publisher
    # NOTE(review): bind-then-close leaves a small window where another
    # process could grab the port before the publisher binds it
    port: int
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("0.0.0.0", 0))
        port = s.getsockname()[1]

    events_endpoint = f"tcp://*:{port}"
    kv_events_config = KVEventsConfig(
        enable_kv_cache_events=True,
        publisher="zmq",
        endpoint=events_endpoint,
        topic="test",
    )

    # force the attention backend under test for the engine subprocesses
    with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend):
        llm = LLM(
            model="meta-llama/Llama-3.2-1B-Instruct",
            gpu_memory_utilization=0.5,
            kv_events_config=kv_events_config,
            kv_transfer_config=kv_transfer_config,
        )

    # subscriber connects to the concrete loopback address, not the wildcard
    events_endpoint = events_endpoint.replace("*", "127.0.0.1")
    subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic)

    try:
        _latency_test(llm, subscriber)
        _accuracy_test(llm, subscriber)
    finally:
        subscriber.close()
        del llm
|
||||
153
tests/v1/kv_offload/test_worker.py
Normal file
153
tests/v1/kv_offload/test_worker.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from vllm.v1.kv_offload.abstract import LoadStoreSpec
|
||||
from vllm.v1.kv_offload.worker.worker import (
|
||||
OffloadingHandler,
|
||||
OffloadingWorker,
|
||||
TransferResult,
|
||||
TransferSpec,
|
||||
)
|
||||
|
||||
|
||||
class LoadStoreSpec1(LoadStoreSpec):
    """Fake spec for medium "1" that scripts how its transfer behaves."""

    def __init__(
        self,
        submit_success: bool = True,
        async_success: bool = True,
        exception: bool = False,
    ):
        # Behavior flags consumed by the test handlers.
        self.submit_success = submit_success
        self.async_success = async_success
        self.exception = exception
        # Flipped by the test body to mark the transfer as completed.
        self.finished = False

    @staticmethod
    def medium() -> str:
        return "1"

    def __repr__(self):
        return f"{self.medium()}: {id(self)}"
|
||||
|
||||
|
||||
class LoadStoreSpec2(LoadStoreSpec):
    """Fake spec for medium "2"; carries no per-transfer state."""

    def __repr__(self):
        return f"{self.medium()}: {id(self)}"

    @staticmethod
    def medium() -> str:
        return "2"
|
||||
|
||||
|
||||
class OffloadingHandler1To2(OffloadingHandler):
    """Test handler for transfers from medium "1" to medium "2"."""

    def __init__(self):
        # In-flight transfers keyed by job id; the stored source spec's
        # flags determine how and when each transfer completes.
        self.transfers: dict[int, LoadStoreSpec1] = {}

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        source, dest = spec
        assert isinstance(source, LoadStoreSpec1)
        assert isinstance(dest, LoadStoreSpec2)

        # Scripted failure modes: raise, or decline the submission.
        if source.exception:
            raise Exception("An expected exception. Don't worry!")
        if not source.submit_success:
            return False

        self.transfers[job_id] = source
        return True

    def get_finished(self) -> list[TransferResult]:
        # Collect completed jobs first, then drop them from the
        # in-flight map.
        done = [
            (job_id, spec.async_success)
            for job_id, spec in self.transfers.items()
            if spec.finished
        ]
        for job_id, _ in done:
            del self.transfers[job_id]
        return done
|
||||
|
||||
|
||||
class OffloadingHandler2To1(OffloadingHandler):
    """Test handler for transfers from medium "2" to medium "1"."""

    def __init__(self):
        # In-flight transfers keyed by job id; here the *destination*
        # spec (a LoadStoreSpec1) carries the completion flags.
        self.transfers: dict[int, LoadStoreSpec1] = {}

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        source, dest = spec
        assert isinstance(source, LoadStoreSpec2)
        assert isinstance(dest, LoadStoreSpec1)

        # This direction always submits successfully.
        self.transfers[job_id] = dest
        return True

    def get_finished(self) -> list[TransferResult]:
        # Collect completed jobs first, then drop them from the
        # in-flight map.
        done = [
            (job_id, spec.async_success)
            for job_id, spec in self.transfers.items()
            if spec.finished
        ]
        for job_id, _ in done:
            del self.transfers[job_id]
        return done
|
||||
|
||||
|
||||
def test_offloading_worker():
    """
    Tests OffloadingWorker with 2 handlers.
    One handler performs 1->2 transfers, and the other handles 2->1.

    Exercises submission failures (exception and rejection), async
    completion with success/failure results, and handler dispatch by
    (src, dst) medium pair.
    """
    worker = OffloadingWorker()
    handler1to2 = OffloadingHandler1To2()
    handler2to1 = OffloadingHandler2To1()
    worker.register_handler(LoadStoreSpec1, LoadStoreSpec2, handler1to2)
    worker.register_handler(LoadStoreSpec2, LoadStoreSpec1, handler2to1)

    # 1st transfer 1->2 (handler raises -> submission fails)
    src1 = LoadStoreSpec1(exception=True)
    dst1 = LoadStoreSpec2()
    assert not worker.transfer_async(1, (src1, dst1))

    # 2nd transfer 1->2 (failure to submit)
    src2 = LoadStoreSpec1(submit_success=False)
    dst2 = LoadStoreSpec2()
    assert not worker.transfer_async(2, (src2, dst2))

    # 3rd transfer 1->2 (submits OK, will complete with failure)
    src3 = LoadStoreSpec1(async_success=False)
    dst3 = LoadStoreSpec2()
    assert worker.transfer_async(3, (src3, dst3))

    # 4th transfer 1->2 (success)
    src4 = LoadStoreSpec1()
    dst4 = LoadStoreSpec2()
    assert worker.transfer_async(4, (src4, dst4))
    assert set(handler1to2.transfers.keys()) == {3, 4}

    # 5th transfer 2->1 (dispatched to the other handler)
    src5 = LoadStoreSpec2()
    dst5 = LoadStoreSpec1()
    assert worker.transfer_async(5, (src5, dst5))
    assert set(handler2to1.transfers.keys()) == {5}

    # no transfer completed yet
    assert worker.get_finished() == []

    # complete 3rd, 4th
    src3.finished = True
    src4.finished = True

    # 6th transfer 1->2
    src6 = LoadStoreSpec1()
    dst6 = LoadStoreSpec2()
    assert worker.transfer_async(6, (src6, dst6))

    # 7th transfer 2->1
    src7 = LoadStoreSpec2()
    dst7 = LoadStoreSpec1()
    assert worker.transfer_async(7, (src7, dst7))

    # 6th and 7th transfers started
    assert 6 in handler1to2.transfers
    assert 7 in handler2to1.transfers

    # verify result of 3rd and 4th transfers
    assert sorted(worker.get_finished()) == [(3, False), (4, True)]

    # complete 6th and 7th transfers (note: for 2->1, the *dst* spec
    # carries the completion flag)
    src6.finished = True
    dst7.finished = True
    assert sorted(worker.get_finished()) == [(6, True), (7, True)]
|
||||
Reference in New Issue
Block a user