Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,191 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import time
import pytest
import torch
from vllm.platforms import current_platform
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
# Attention backends whose KV-cache layouts are exercised by test_transfer.
BACKENDS_TO_TEST = [FlashAttentionBackend]
if not current_platform.is_rocm():
    # These two backends are only imported off ROCm (NOTE(review): presumably
    # because the modules are CUDA-only and would fail to import on ROCm).
    from vllm.v1.attention.backends.flashinfer import FlashInferBackend

    BACKENDS_TO_TEST.append(FlashInferBackend)
    from vllm.v1.attention.backends.mla.flashattn_mla import FlashAttnMLABackend

    BACKENDS_TO_TEST.append(FlashAttnMLABackend)

# Parametrization grids for test_transfer below.
NUM_GPU_BLOCKS = [64]
NUM_CPU_BLOCKS = [256]
GPU_BLOCK_SIZES = [16]
# 1 == CPU and GPU blocks are the same size; 3 == one CPU block spans 3 GPU blocks.
GPU_BLOCKS_PER_CPU_BLOCK = [1, 3]
HEAD_SIZES = [64]
NUM_HEADS = [8]
NUM_LAYERS = [4]
DTYPES = [torch.bfloat16]
SEEDS = [0]
CUDA_DEVICES = ["cuda:0"]
# number of CPU-block-sized mappings transferred per test case
NUM_MAPPINGS = [3]
@pytest.mark.parametrize("gpu_to_cpu", [True, False])
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("gpu_block_size", GPU_BLOCK_SIZES)
@pytest.mark.parametrize("gpu_blocks_per_cpu_block", GPU_BLOCKS_PER_CPU_BLOCK)
@pytest.mark.parametrize("num_gpu_blocks", NUM_GPU_BLOCKS)
@pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_transfer(
    gpu_to_cpu: bool,
    num_mappings: int,
    head_size: int,
    num_heads: int,
    gpu_block_size: int,
    gpu_blocks_per_cpu_block: int,
    num_gpu_blocks: int,
    num_cpu_blocks: int,
    num_layers: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    """Run one async KV-cache transfer through CpuGpuOffloadingHandlers and
    verify the result block-by-block.

    Builds per-layer GPU caches (cycling through BACKENDS_TO_TEST), submits a
    single transfer (GPU->CPU or CPU->GPU depending on ``gpu_to_cpu``), waits
    for completion, then checks that mapped destination blocks equal their
    source blocks and that every other block (and all source blocks) is
    untouched.
    """
    current_platform.seed_everything(seed)

    # create per-layer GPU KV caches based on available attn_backends
    attn_backends_list = BACKENDS_TO_TEST
    gpu_caches = {}
    attn_backends = {}
    for i in range(num_layers):
        layer_name = f"layer {i}"
        # cycle through backends so each layer may use a different cache layout
        attn_backend = attn_backends_list[i % len(attn_backends_list)]
        attn_backends[layer_name] = attn_backend
        gpu_cache_shape = attn_backend.get_kv_cache_shape(
            num_gpu_blocks, gpu_block_size, num_heads, head_size
        )
        gpu_caches[layer_name] = torch.rand(gpu_cache_shape, dtype=dtype, device=device)

    # create handler
    cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size
    handlers = CpuGpuOffloadingHandlers(
        attn_backends=attn_backends,
        gpu_block_size=gpu_block_size,
        cpu_block_size=cpu_block_size,
        num_cpu_blocks=num_cpu_blocks,
        gpu_caches=gpu_caches,
    )

    # select block mappings
    gpu_blocks = random.sample(
        range(num_gpu_blocks), num_mappings * gpu_blocks_per_cpu_block
    )
    cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings)

    # convert cpu blocks to gpu block size
    cpu_blocks_in_gpu_block_size = []
    for cpu_block in cpu_blocks:
        base_block_id = cpu_block * gpu_blocks_per_cpu_block
        for i in range(gpu_blocks_per_cpu_block):
            cpu_blocks_in_gpu_block_size.append(i + base_block_id)

    # maybe skip a GPU block to test reading from the middle of a CPU block
    if not gpu_to_cpu:
        gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1 :]
        cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[
            gpu_blocks_per_cpu_block - 1 :
        ]

    # set transfer direction
    if gpu_to_cpu:
        handler = handlers.gpu_to_cpu_handler
        src_spec_class = GPULoadStoreSpec
        dst_spec_class = CPULoadStoreSpec
        src_blocks = gpu_blocks
        dst_blocks = cpu_blocks
        src_blocks_in_gpu_block_size = gpu_blocks
        dst_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_size_in_gpu_blocks = num_cpu_blocks * gpu_blocks_per_cpu_block
    else:
        handler = handlers.cpu_to_gpu_handler
        src_spec_class = CPULoadStoreSpec
        dst_spec_class = GPULoadStoreSpec
        src_blocks = cpu_blocks
        dst_blocks = gpu_blocks
        src_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size
        dst_blocks_in_gpu_block_size = gpu_blocks
        dst_size_in_gpu_blocks = num_gpu_blocks

    # build dst -> src mapping (in GPU-block-sized units) for verification
    dst_to_src = {}
    for src_block, dst_block in zip(
        src_blocks_in_gpu_block_size, dst_blocks_in_gpu_block_size
    ):
        dst_to_src[dst_block] = src_block

    # build transfer specs
    src_spec = src_spec_class(src_blocks)
    dst_spec = dst_spec_class(dst_blocks)

    # clone src and dst tensors before transfer
    orig_src_caches = [x.clone() for x in handler.src_tensors]
    orig_dst_caches = [x.clone() for x in handler.dst_tensors]

    # call transfer function (job_id=1)
    assert handler.transfer_async(1, (src_spec, dst_spec))
    # NOTE(review): peeks at the handler's private _transfers bookkeeping
    assert set({x[0] for x in handler._transfers}) == {1}

    # wait for transfer to complete (poll up to ~10 seconds)
    end_time = time.time() + 10
    while time.time() < end_time:
        finished = handler.get_finished()
        if finished:
            assert finished == [(1, True)]
            break
        time.sleep(0.1)

    # verify src tensors did not change
    for orig_tensor, tensor in zip(orig_src_caches, handler.src_tensors):
        assert torch.equal(orig_tensor, tensor)

    # verify dst tensors: mapped blocks must equal their source block,
    # unmapped blocks must be unchanged
    for dst_block in range(dst_size_in_gpu_blocks):
        src_block_candidate = dst_to_src.get(dst_block)
        for src_cache, dst_cache, orig_dst_cache, kv_dim in zip(
            handler.src_tensors,
            handler.dst_tensors,
            orig_dst_caches,
            handler.kv_dim_before_num_blocks,
        ):
            if kv_dim:
                # iterate over key, value
                for i in range(2):
                    if src_block_candidate is not None:
                        expected_value = src_cache[i][src_block_candidate]
                    else:
                        expected_value = orig_dst_cache[i][dst_block]
                    torch.testing.assert_close(
                        dst_cache[i][dst_block].cpu(), expected_value.cpu()
                    )
            else:
                # layout without a leading key/value dimension (e.g. MLA)
                if src_block_candidate is not None:
                    expected_value = src_cache[src_block_candidate]
                else:
                    expected_value = orig_dst_cache[dst_block]
                torch.testing.assert_close(
                    dst_cache[dst_block].cpu(), expected_value.cpu()
                )

View File

@@ -0,0 +1,497 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from dataclasses import dataclass
import numpy as np
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.kv_offload.abstract import (
LoadStoreSpec,
OffloadingEvent,
PrepareStoreOutput,
)
from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager
from vllm.v1.kv_offload.backends.cpu import CPUBackend
from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
@dataclass
class ExpectedPrepareStoreOutput:
    """Plain-int description of an expected PrepareStoreOutput.

    Integers stand in for block hashes (converted via to_hashes) so test
    expectations stay readable.
    """

    # int stand-ins for the hashes expected in block_hashes_to_store
    block_hashes_to_store: list[int]
    # backend block ids expected in the resulting store spec
    store_block_ids: list[int]
    # int stand-ins for the hashes expected to be evicted
    block_hashes_evicted: list[int]
def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
    """Turn each integer into a BlockHash built from its decimal utf-8 bytes."""
    return [BlockHash(f"{i}".encode()) for i in int_hashes]
def verify_store_output(
    prepare_store_output: PrepareStoreOutput | None,
    expected_prepare_store_output: ExpectedPrepareStoreOutput,
):
    """Assert that a PrepareStoreOutput matches its plain-int expectation.

    Checks the hashes to store, the hashes evicted, and that the store spec
    is a CPULoadStoreSpec carrying exactly the expected block ids.
    """
    expected = expected_prepare_store_output
    assert prepare_store_output is not None
    assert prepare_store_output.block_hashes_to_store == to_hashes(
        expected.block_hashes_to_store
    )
    assert prepare_store_output.block_hashes_evicted == to_hashes(
        expected.block_hashes_evicted
    )
    spec = prepare_store_output.store_spec
    assert isinstance(spec, CPULoadStoreSpec)
    want_ids = np.array(expected.store_block_ids, dtype=np.int64)
    assert np.array_equal(want_ids, spec.block_ids)
def verify_load_output(
    prepare_load_output: LoadStoreSpec, expected_prepare_load_output: list[int]
):
    """Assert that a load spec is a CPULoadStoreSpec with the expected ids."""
    assert isinstance(prepare_load_output, CPULoadStoreSpec)
    want_ids = np.array(expected_prepare_load_output, dtype=np.int64)
    assert np.array_equal(want_ids, prepare_load_output.block_ids)
def verify_events(
    events: Iterable[OffloadingEvent],
    block_size: int,
    expected_stores: tuple[set[int], ...] = (),
    expected_evictions: tuple[set[int], ...] = (),
):
    """Assert that the event stream contains exactly the expected store and
    eviction batches (as sets of hashes), in order, all on the CPU medium
    with the given block size.
    """
    seen_stores: list[set[BlockHash]] = []
    seen_evictions: list[set[BlockHash]] = []
    for event in events:
        assert event.medium == CPULoadStoreSpec.medium()
        assert event.block_size == block_size
        # route each batch into the matching bucket
        bucket = seen_evictions if event.removed else seen_stores
        bucket.append(set(event.block_hashes))

    def as_hash_sets(int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]:
        return tuple(set(to_hashes(list(int_set))) for int_set in int_sets)

    assert tuple(seen_evictions) == as_hash_sets(expected_evictions)
    assert tuple(seen_stores) == as_hash_sets(expected_stores)
def test_cpu_manager():
    """
    Tests LRUOffloadingManager with a CPUBackend.

    Walks through a fixed sequence of store/load/touch operations on a
    4-block cache and asserts block placement, eviction order (LRU),
    ref-count protection of in-flight loads, and the emitted events.
    Each step depends on the manager state left by the previous one.
    """
    # initialize a CPU backend with a capacity of 4 blocks
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    cpu_manager = LRUOffloadingManager(cpu_backend, enable_events=True)

    # prepare store [1, 2]
    prepare_store_output = cpu_manager.prepare_store(to_hashes([1, 2]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[1, 2],
            store_block_ids=[0, 1],
            block_hashes_evicted=[],
        ),
    )

    # lookup [1, 2] -> not ready (store not completed yet)
    assert cpu_manager.lookup(to_hashes([1, 2])) == 0

    # no events so far
    assert list(cpu_manager.take_events()) == []

    # complete store [1, 2]
    cpu_manager.complete_store(to_hashes([1, 2]))
    verify_events(
        cpu_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
    )

    # lookup [1, 2] -> counts matching leading blocks only
    assert cpu_manager.lookup(to_hashes([1])) == 1
    assert cpu_manager.lookup(to_hashes([1, 2])) == 2
    assert cpu_manager.lookup(to_hashes([1, 2, 3])) == 2

    # prepare store [2, 3, 4, 5] -> evicts [1]; block 2 is already stored
    prepare_store_output = cpu_manager.prepare_store(to_hashes([2, 3, 4, 5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[3, 4, 5],
            store_block_ids=[2, 3, 0],
            block_hashes_evicted=[1],
        ),
    )

    # verify eviction event
    verify_events(
        cpu_manager.take_events(), block_size=block_size, expected_evictions=({1},)
    )

    # prepare store with no space
    assert cpu_manager.prepare_store(to_hashes([1, 6])) is None

    # complete store [2, 3, 4, 5]
    cpu_manager.complete_store(to_hashes([2, 3, 4, 5]))

    # prepare load [2, 3]
    prepare_load_output = cpu_manager.prepare_load(to_hashes([2, 3]))
    verify_load_output(prepare_load_output, [1, 2])

    # prepare store with no space ([2, 3] is being loaded, so not evictable)
    assert cpu_manager.prepare_store(to_hashes([6, 7, 8])) is None

    # complete load [2, 3]
    cpu_manager.complete_load(to_hashes([2, 3]))

    # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest)
    prepare_store_output = cpu_manager.prepare_store(to_hashes([6, 7, 8]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[6, 7, 8],
            store_block_ids=[3, 2, 1],
            block_hashes_evicted=[2, 3, 4],
        ),
    )

    # complete store [6, 7, 8]
    cpu_manager.complete_store(to_hashes([6, 7, 8]))

    # touch [5, 6, 7] (move to end of LRU order)
    cpu_manager.touch(to_hashes([5, 6, 7]))

    # prepare store [9] -> evicts [8] (oldest following previous touch)
    prepare_store_output = cpu_manager.prepare_store(to_hashes([9]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[9],
            store_block_ids=[1],
            block_hashes_evicted=[8],
        ),
    )

    # complete store [7, 9] with failure; 7 was already stored, 9 was pending
    cpu_manager.complete_store(to_hashes([7, 9]), success=False)

    # assert [7] is still stored, but [9] is not
    assert cpu_manager.lookup(to_hashes([7])) == 1
    assert cpu_manager.lookup(to_hashes([9])) == 0

    # drain remaining events accumulated since the last verify_events call
    verify_events(
        cpu_manager.take_events(),
        block_size=block_size,
        expected_stores=({3, 4, 5}, {6, 7, 8}),
        expected_evictions=({2, 3, 4}, {8}),
    )
def test_arc_manager_basic():
    """
    Tests ARCOffloadingManager basic operations with a CPUBackend.
    Verifies that ARC handles store, load, and lookup operations correctly,
    and that freshly stored blocks land in T1 (the recency list).
    """
    # initialize a CPU backend with a capacity of 4 blocks
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # prepare store [1, 2]
    prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[1, 2],
            store_block_ids=[0, 1],
            block_hashes_evicted=[],
        ),
    )

    # lookup [1, 2] -> not ready (store not completed yet)
    assert arc_manager.lookup(to_hashes([1, 2])) == 0

    # no events so far
    assert list(arc_manager.take_events()) == []

    # complete store [1, 2]
    arc_manager.complete_store(to_hashes([1, 2]))
    verify_events(
        arc_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
    )

    # lookup [1, 2] -> counts matching leading blocks only
    assert arc_manager.lookup(to_hashes([1])) == 1
    assert arc_manager.lookup(to_hashes([1, 2])) == 2
    assert arc_manager.lookup(to_hashes([1, 2, 3])) == 2

    # blocks should be in T1 (recent), none promoted to T2 (frequent) yet
    assert len(arc_manager.t1) == 2
    assert len(arc_manager.t2) == 0
def test_arc_manager_t1_to_t2_promotion():
    """
    Tests that accessing a block in T1 promotes it to T2 (frequent).
    This is a key feature of ARC's adaptive behavior.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # store and complete block 1
    arc_manager.prepare_store(to_hashes([1]))
    arc_manager.complete_store(to_hashes([1]))

    # block 1 starts in T1 (recent)
    assert to_hashes([1])[0] in arc_manager.t1
    assert to_hashes([1])[0] not in arc_manager.t2

    # touch block 1 (simulate second access)
    arc_manager.touch(to_hashes([1]))

    # block 1 should now be in T2 (frequent)
    assert to_hashes([1])[0] not in arc_manager.t1
    assert to_hashes([1])[0] in arc_manager.t2
def test_arc_manager_eviction_with_load():
    """
    Tests ARC eviction behavior similar to LRU test.
    Verifies that blocks being loaded (ref_cnt > 0) cannot be evicted,
    and that eviction succeeds once the load completes.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # prepare and complete store [1, 2, 3, 4] (fills the cache)
    prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[1, 2, 3, 4],
            store_block_ids=[0, 1, 2, 3],
            block_hashes_evicted=[],
        ),
    )
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # prepare load [2, 3] (increases ref_cnt)
    prepare_load_output = arc_manager.prepare_load(to_hashes([2, 3]))
    verify_load_output(prepare_load_output, [1, 2])

    # prepare store [5, 6, 7] with [2, 3] being loaded
    # should fail because [2, 3] have ref_cnt > 0
    assert arc_manager.prepare_store(to_hashes([5, 6, 7])) is None

    # complete load [2, 3]
    arc_manager.complete_load(to_hashes([2, 3]))

    # now prepare store [5, 6, 7] should succeed
    # ARC will evict blocks one at a time from T1 as needed
    prepare_store_output = arc_manager.prepare_store(to_hashes([5, 6, 7]))
    assert prepare_store_output is not None

    # Should successfully evict enough blocks to make room (at least 1)
    assert len(prepare_store_output.block_hashes_evicted) >= 1
def test_arc_manager_adaptive_target():
    """
    Tests ARC's adaptive target adjustment via ghost lists.
    When a block in B1 (ghost list) is accessed, target_t1_size increases.
    When a block in B2 is accessed, target_t1_size decreases.
    Only the B1 (recency-favoring) direction is exercised here.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=2)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # store blocks 1, 2 (fills cache)
    arc_manager.prepare_store(to_hashes([1, 2]))
    arc_manager.complete_store(to_hashes([1, 2]))

    initial_target = arc_manager.target_t1_size

    # store block 3, evicting block 1 (moves to B1 ghost list)
    arc_manager.prepare_store(to_hashes([3]))
    arc_manager.complete_store(to_hashes([3]))

    # block 1 should be in B1 (ghost list)
    assert to_hashes([1])[0] in arc_manager.b1

    # touch block 1 (cache miss, but in B1)
    # this should increase target_t1_size (favor recency)
    arc_manager.touch(to_hashes([1]))

    # target should have increased
    assert arc_manager.target_t1_size > initial_target
def test_arc_manager_t1_t2_eviction_policy():
    """
    Tests that ARC evicts from T1 or T2 based on target_t1_size.
    If |T1| >= target_t1_size, evict from T1, otherwise from T2.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # store blocks 1, 2, 3, 4
    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # promote blocks 3, 4 to T2 by touching them
    arc_manager.touch(to_hashes([3, 4]))

    # now: T1 = {1, 2}, T2 = {3, 4}
    assert len(arc_manager.t1) == 2
    assert len(arc_manager.t2) == 2

    # set target_t1_size to prefer evicting from T1
    # (when |T1| >= target, evict from T1)
    arc_manager.target_t1_size = 1

    # store block 5, should evict from T1 (block 1, LRU in T1)
    output = arc_manager.prepare_store(to_hashes([5]))
    assert output is not None
    # exactly block 1 is evicted (list comparison: single element)
    assert to_hashes([1]) == output.block_hashes_evicted
    arc_manager.complete_store(to_hashes([5]))

    # block 1 should be in B1 (ghost list)
    assert to_hashes([1])[0] in arc_manager.b1
    # block 5 should be in T1
    assert to_hashes([5])[0] in arc_manager.t1
def test_arc_manager_ghost_list_bounds():
    """
    Tests that ghost lists (B1, B2) don't grow unbounded.
    They should be capped at cache_capacity.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=2)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=False)

    # fill cache with blocks 1, 2
    arc_manager.prepare_store(to_hashes([1, 2]))
    arc_manager.complete_store(to_hashes([1, 2]))

    # store many blocks to fill ghost lists (each store evicts an older block,
    # pushing its hash into a ghost list)
    for i in range(3, 20):
        arc_manager.prepare_store(to_hashes([i]))
        arc_manager.complete_store(to_hashes([i]))

    # ghost lists should not exceed cache_capacity
    assert len(arc_manager.b1) <= arc_manager.cache_capacity
    assert len(arc_manager.b2) <= arc_manager.cache_capacity
def test_arc_manager_touch_ordering():
    """
    Tests that touch() correctly updates access patterns.
    Similar to LRU test but verifies T1/T2 ordering.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # store blocks 1, 2, 3, 4
    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # promote 3, 4 to T2
    arc_manager.touch(to_hashes([3, 4]))
    # T1 = {1, 2}, T2 = {3, 4}

    # touch [1, 3, 4] - should promote 1 to T2, and move 3,4 to end of T2
    arc_manager.touch(to_hashes([1, 3, 4]))
    # T1 = {2}, T2 = {1, 3, 4} (in that order, with 4 most recent)
    assert len(arc_manager.t1) == 1
    assert len(arc_manager.t2) == 3

    # store block 5, should evict from T1 (block 2, only one in T1)
    prepare_store_output = arc_manager.prepare_store(to_hashes([5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            block_hashes_to_store=[5],
            store_block_ids=[1],  # reuses block 2's storage
            block_hashes_evicted=[2],
        ),
    )
def test_arc_manager_failed_store():
    """
    Tests that failed store operations clean up correctly.
    Similar to LRU test but for ARC: the failed block must not appear in
    the cache or in T1/T2, while the eviction it triggered remains.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # store blocks 1, 2, 3, 4
    arc_manager.prepare_store(to_hashes([1, 2, 3, 4]))
    arc_manager.complete_store(to_hashes([1, 2, 3, 4]))

    # prepare store block 5 (will evict one block)
    prepare_store_output = arc_manager.prepare_store(to_hashes([5]))
    assert prepare_store_output is not None
    assert len(prepare_store_output.block_hashes_evicted) == 1

    # complete store with failure
    arc_manager.complete_store(to_hashes([5]), success=False)

    # block 5 should not be in cache
    assert arc_manager.lookup(to_hashes([5])) == 0
    # block 5 should not be in T1 or T2
    assert to_hashes([5])[0] not in arc_manager.t1
    assert to_hashes([5])[0] not in arc_manager.t2

    # evicted block should still be gone (in B1 ghost list)
    evicted_hash = prepare_store_output.block_hashes_evicted[0]
    assert evicted_hash in arc_manager.b1
def test_arc_manager_full_scenario():
    """
    Comprehensive test covering multiple ARC operations in sequence.
    Similar to the full LRU test but adapted for ARC behavior.
    """
    block_size = 256
    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
    arc_manager = ARCOffloadingManager(cpu_backend, enable_events=True)

    # store [1, 2]
    arc_manager.prepare_store(to_hashes([1, 2]))
    arc_manager.complete_store(to_hashes([1, 2]))

    # store [3, 4, 5] -> evicts [1]
    prepare_store_output = arc_manager.prepare_store(to_hashes([3, 4, 5]))
    assert prepare_store_output is not None
    assert len(prepare_store_output.block_hashes_evicted) == 1
    arc_manager.complete_store(to_hashes([3, 4, 5]))

    # promote some blocks to T2
    arc_manager.touch(to_hashes([2, 3]))

    # T1 has {4, 5}, T2 has {2, 3}
    assert len(arc_manager.t1) == 2
    assert len(arc_manager.t2) == 2

    # store [6] -> should evict from T1 (4 is oldest in T1)
    prepare_store_output = arc_manager.prepare_store(to_hashes([6]))
    assert prepare_store_output is not None
    arc_manager.complete_store(to_hashes([6]))

    # verify blocks 2, 3 (in T2) are still present
    assert arc_manager.lookup(to_hashes([2])) == 1
    assert arc_manager.lookup(to_hashes([3])) == 1

    # verify events were emitted
    events = list(arc_manager.take_events())
    assert len(events) > 0  # should have store and eviction events

View File

@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import socket
import time
import msgspec
import msgspec.msgpack
import pytest
import zmq
from tqdm import tqdm
from vllm import LLM, SamplingParams, TokensPrompt
from vllm.config import KVEventsConfig, KVTransferConfig
from vllm.distributed.kv_events import BlockStored, KVEventBatch
from vllm.platforms import current_platform
from vllm.utils.system_utils import set_env_var
# CPU offloading block sizes to parametrize over.
CPU_BLOCK_SIZES = [48]

# Attention backends to exercise, adjusted per platform:
# FLASHINFER is added on CUDA; ROCm uses TRITON_ATTN only.
ATTN_BACKENDS = ["FLASH_ATTN"]
if current_platform.is_cuda():
    ATTN_BACKENDS.append("FLASHINFER")
elif current_platform.is_rocm():
    ATTN_BACKENDS = ["TRITON_ATTN"]
class MockSubscriber:
    """Helper class to receive and verify published events"""

    def __init__(
        self,
        endpoint: str,
        topic: str,
    ):
        self.ctx = zmq.Context.instance()
        self.topic_bytes = topic.encode("utf-8")

        # Set up subscriber socket filtered to the configured topic
        self.sub = self.ctx.socket(zmq.SUB)
        self.sub.setsockopt(zmq.SUBSCRIBE, self.topic_bytes)
        self.sub.connect(endpoint)

        self.decoder = msgspec.msgpack.Decoder(type=KVEventBatch)

    def get_new_cpu_stored_events(self) -> list[BlockStored]:
        """Drain pending event batches and return the CPU BlockStored events.

        Polls until the socket goes quiet: waits up to 1s for the first
        message, then 100ms between subsequent ones. Returns whatever CPU
        BlockStored events arrived (possibly an empty list).
        """
        cpu_stored_events: list[BlockStored] = []
        poller = zmq.Poller()
        poller.register(self.sub, zmq.POLLIN)
        timeout = 1000  # 1 second for the first message
        while True:
            events = dict(poller.poll(timeout))
            if events.get(self.sub) != zmq.POLLIN:
                # socket went quiet -> return what we have
                return cpu_stored_events
            # frames: topic, (middle frame, unused here), msgpack payload
            topic_bytes, _, payload = self.sub.recv_multipart()
            assert topic_bytes == self.topic_bytes
            event_batch = self.decoder.decode(payload)
            assert isinstance(event_batch, KVEventBatch)
            for event in event_batch.events:
                # keep only stores on the CPU offloading medium
                if isinstance(event, BlockStored) and event.medium == "CPU":
                    cpu_stored_events.append(event)
            # shorter timeout for follow-up messages
            timeout = 100

    def close(self):
        """Clean up resources"""
        self.sub.close()
def _latency_test(llm: LLM, subscriber: MockSubscriber):
    """Compare cold vs GPU-prefix-hit vs CPU-offload-hit generation latency.

    For each of 10 unique long prompts: generate cold, generate again (GPU
    prefix cache hit), reset the GPU prefix cache, then generate a third
    time (which should load KV from CPU). Asserts CPU-hit latency beats
    cold latency at least 80% of the time.
    """
    sampling_params = SamplingParams(max_tokens=1)

    num_times_cpu_better_than_cold = 0
    num_tests = 10
    total_cold_time = 0.0
    total_gpu_hit_time = 0.0
    total_cpu_hit_time = 0.0

    # long prompt so prefill dominates; first token varies per iteration
    # to make each prompt unique (avoid cross-iteration cache hits)
    prompt_token_ids = [0] * 10001
    for i in tqdm(range(num_tests), desc="Running tests"):
        prompt_token_ids[0] = i
        prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)]

        # run generation - this should trigger saving KV cache
        start_time = time.time()
        llm.generate(prompts, sampling_params, use_tqdm=False)
        cold_time = time.time() - start_time
        total_cold_time += cold_time

        # run generation again - should hit the GPU prefix cache
        start_time = time.time()
        llm.generate(prompts, sampling_params, use_tqdm=False)
        gpu_hit_time = time.time() - start_time
        total_gpu_hit_time += gpu_hit_time

        # reset prefix cache to avoid GPU hit.
        llm.reset_prefix_cache()
        # make sure the KV data actually reached the CPU backend
        assert subscriber.get_new_cpu_stored_events()

        # run generation again - this should trigger loading from CPU
        start_time = time.time()
        llm.generate(prompts, sampling_params, use_tqdm=False)
        cpu_hit_time = time.time() - start_time
        total_cpu_hit_time += cpu_hit_time

        if cpu_hit_time < cold_time:
            num_times_cpu_better_than_cold += 1

    print("Average times:")
    print(f"    Cold: {total_cold_time * 1000 / num_tests:.2f}ms")
    print(f"    GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms")
    print(f"    CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms")

    # allow some timing noise: require 80% wins rather than all
    assert num_times_cpu_better_than_cold >= 0.8 * num_tests
def _accuracy_test(llm: LLM, subscriber: MockSubscriber):
    """Verify generation quality is preserved when KV is served from CPU.

    Pads a counting prompt until its token length is a multiple of the CPU
    block size (so the whole prefix is offloadable), then repeatedly
    generates and checks the model still continues the count correctly at
    least half the time.
    """
    sampling_params = SamplingParams(max_tokens=1)

    cpu_block_size = (
        llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config[
            "block_size"
        ]
    )

    # drain any previously published events
    subscriber.get_new_cpu_stored_events()

    # prepend prompt to be cpu block aligned
    prompt = "Let's count to 10. One, two, three, four,"
    while (
        len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size
        != 0
    ):
        prompt = ". " + prompt

    # the aligned prompt's KV must have been stored to CPU
    assert subscriber.get_new_cpu_stored_events()

    test_count = 100
    success_count = 0
    for i in range(test_count):
        if (
            llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text
            == " five"
        ):
            success_count += 1

    # sampling is stochastic; require a majority of correct continuations
    assert success_count >= 0.5 * test_count
@pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES)
@pytest.mark.parametrize("attn_backend", ATTN_BACKENDS)
def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None:
    """
    Tests OffloadingConnector with CPUOffloadingSpec.

    Spins up an LLM with CPU KV offloading and ZMQ KV-event publishing,
    then runs the latency and accuracy sub-tests against it.
    """
    # configure OffloadingConnector (spec_name=CPUOffloadingSpec by default)
    kv_transfer_config = KVTransferConfig(
        kv_connector="OffloadingConnector",
        kv_role="kv_both",
        kv_connector_extra_config={
            "num_cpu_blocks": 1000,
            "block_size": cpu_block_size,
        },
    )

    # grab a free TCP port for the event publisher.
    # NOTE(review): the socket is closed before the publisher binds, so the
    # port could in principle be re-taken in between (benign flake risk).
    port: int
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("0.0.0.0", 0))
        port = s.getsockname()[1]

    events_endpoint = f"tcp://*:{port}"
    kv_events_config = KVEventsConfig(
        enable_kv_cache_events=True,
        publisher="zmq",
        endpoint=events_endpoint,
        topic="test",
    )

    with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend):
        llm = LLM(
            model="meta-llama/Llama-3.2-1B-Instruct",
            gpu_memory_utilization=0.5,
            kv_events_config=kv_events_config,
            kv_transfer_config=kv_transfer_config,
        )

    # subscriber connects to the concrete loopback address
    events_endpoint = events_endpoint.replace("*", "127.0.0.1")
    subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic)

    try:
        _latency_test(llm, subscriber)
        _accuracy_test(llm, subscriber)
    finally:
        subscriber.close()
        del llm

View File

@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.v1.kv_offload.abstract import LoadStoreSpec
from vllm.v1.kv_offload.worker.worker import (
OffloadingHandler,
OffloadingWorker,
TransferResult,
TransferSpec,
)
class LoadStoreSpec1(LoadStoreSpec):
    """Fake spec for medium "1" with configurable transfer outcomes.

    The flags control how the test handlers treat a transfer involving this
    spec: raise on submit, fail the submit, or fail/succeed asynchronously.
    """

    def __init__(
        self,
        submit_success: bool = True,
        async_success: bool = True,
        exception: bool = False,
    ):
        self.exception = exception
        self.submit_success = submit_success
        self.async_success = async_success
        # set by the test to mark the async transfer as done
        self.finished = False

    @staticmethod
    def medium() -> str:
        return "1"

    def __repr__(self):
        return "{}: {}".format(self.medium(), id(self))
class LoadStoreSpec2(LoadStoreSpec):
    """Fake spec for medium "2"; carries no per-transfer state."""

    @staticmethod
    def medium() -> str:
        return "2"

    def __repr__(self):
        return "{}: {}".format(self.medium(), id(self))
class OffloadingHandler1To2(OffloadingHandler):
    """Test handler for 1 -> 2 transfers.

    Completion is driven by the *source* spec: its flags decide whether
    submission raises, fails, or succeeds, and its ``finished`` flag marks
    the async transfer as done.
    """

    def __init__(self):
        # job_id -> source spec of each in-flight transfer
        self.transfers: dict[int, LoadStoreSpec1] = {}

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        src, dst = spec
        assert isinstance(src, LoadStoreSpec1)
        assert isinstance(dst, LoadStoreSpec2)
        if src.exception:
            raise Exception("An expected exception. Don't worry!")
        if not src.submit_success:
            return False
        self.transfers[job_id] = src
        return True

    def get_finished(self) -> list[TransferResult]:
        # collect finished jobs first, then drop them from the in-flight map
        done = [
            (job_id, spec.async_success)
            for job_id, spec in self.transfers.items()
            if spec.finished
        ]
        for job_id, _ in done:
            del self.transfers[job_id]
        return done
class OffloadingHandler2To1(OffloadingHandler):
    """Test handler for 2 -> 1 transfers.

    Completion is driven by the *destination* spec (the LoadStoreSpec1 side):
    its ``finished`` and ``async_success`` flags determine the result.
    """

    def __init__(self):
        # job_id -> destination spec of each in-flight transfer
        self.transfers: dict[int, LoadStoreSpec1] = {}

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        src, dst = spec
        assert isinstance(src, LoadStoreSpec2)
        assert isinstance(dst, LoadStoreSpec1)
        self.transfers[job_id] = dst
        return True

    def get_finished(self) -> list[TransferResult]:
        # collect finished jobs first, then drop them from the in-flight map
        done = [
            (job_id, spec.async_success)
            for job_id, spec in self.transfers.items()
            if spec.finished
        ]
        for job_id, _ in done:
            del self.transfers[job_id]
        return done
def test_offloading_worker():
    """
    Tests OffloadingWorker with 2 handlers.
    One handler performs 1->2 transfers, and the other handles 2->1.

    Exercises submit-time exceptions, submit failures, async failures and
    successes, and checks that get_finished reports completed transfers from
    both handlers while in-flight ones keep being tracked.
    """
    worker = OffloadingWorker()
    handler1to2 = OffloadingHandler1To2()
    handler2to1 = OffloadingHandler2To1()
    # route transfers by (src medium, dst medium)
    worker.register_handler(LoadStoreSpec1, LoadStoreSpec2, handler1to2)
    worker.register_handler(LoadStoreSpec2, LoadStoreSpec1, handler2to1)

    # 1st transfer 1->2 (exception raised by the handler -> reported as failed)
    src1 = LoadStoreSpec1(exception=True)
    dst1 = LoadStoreSpec2()
    assert not worker.transfer_async(1, (src1, dst1))

    # 2nd transfer 1->2 (failure to submit)
    src2 = LoadStoreSpec1(submit_success=False)
    dst2 = LoadStoreSpec2()
    assert not worker.transfer_async(2, (src2, dst2))

    # 3rd transfer 1->2 (submits fine, will fail asynchronously)
    src3 = LoadStoreSpec1(async_success=False)
    dst3 = LoadStoreSpec2()
    assert worker.transfer_async(3, (src3, dst3))

    # 4th transfer 1->2 (success)
    src4 = LoadStoreSpec1()
    dst4 = LoadStoreSpec2()
    worker.transfer_async(4, (src4, dst4))
    # only the successfully submitted jobs are tracked
    assert set(handler1to2.transfers.keys()) == {3, 4}

    # 5th transfer 2->1
    src5 = LoadStoreSpec2()
    dst5 = LoadStoreSpec1()
    worker.transfer_async(5, (src5, dst5))
    assert set(handler2to1.transfers.keys()) == {5}

    # no transfer completed yet
    assert worker.get_finished() == []

    # complete 3rd, 4th
    src3.finished = True
    src4.finished = True

    # 6th transfer 1->2
    src6 = LoadStoreSpec1()
    dst6 = LoadStoreSpec2()
    worker.transfer_async(6, (src6, dst6))

    # 7th transfer 2->1
    src7 = LoadStoreSpec2()
    dst7 = LoadStoreSpec1()
    worker.transfer_async(7, (src7, dst7))

    # 6th and 7th transfers started
    assert 6 in handler1to2.transfers
    assert 7 in handler2to1.transfers

    # verify result of 3rd (async failure) and 4th (success) transfers
    assert sorted(worker.get_finished()) == [(3, False), (4, True)]

    # complete 6th and 7th transfers (7 completes via its destination spec)
    src6.finished = True
    dst7.finished = True
    assert sorted(worker.get_finished()) == [(6, True), (7, True)]