This commit is contained in:
2026-04-02 04:53:13 +00:00
parent 80932c96e5
commit 24df76db9d
1987 changed files with 447445 additions and 0 deletions

View File

View File

@@ -0,0 +1,165 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
OffloadingManager class for managing KV data offloading in vLLM v1
This class runs in the scheduler, tracks which blocks are offloaded
and their address.
The class provides the following primitives:
lookup() - find the length of the maximal series of blocks,
starting from the first one, that are all offloaded.
prepare_load() - prepare given blocks to be read.
The given blocks will be protected from eviction.
This function returns a LoadSpec which encapsulates
information required for performing the load.
touch() - marks the given blocks as recently used. Can be used
to track block's LRU. This function is separated from the
prepare_load function to allow setting block recency even
for blocks which do not need reading from the cache, such as
blocks that are cached by the GPU prefix cache.
complete_load() - mark blocks which were previously prepared to be
loaded as done loading. This is to re-allow their eviction.
prepare_store() - prepare the given blocks to be written.
Returns a StoreSpec encapsulating offloading information,
as well as a list of blocks that were evicted as a result.
complete_store() - marks a previous store as completed.
Following this call, the given blocks will become loadable.
"""
from abc import ABC, abstractmethod
from collections.abc import Iterable
from dataclasses import dataclass
from typing import Optional
from vllm.v1.core.kv_cache_utils import BlockHash
class LoadStoreSpec(ABC):
    """
    Abstract metadata describing where blocks of KV data reside.

    A worker uses a concrete spec to load blocks of KV data,
    and optionally also to store them.
    """

    @staticmethod
    @abstractmethod
    def medium() -> str:
        """
        Return a string naming the medium type that this
        store/load targets.
        """
        ...
@dataclass
class PrepareStoreOutput:
    """Result returned by OffloadingManager.prepare_store()."""
    # hashes of the blocks which need storing
    block_hashes_to_store: list[BlockHash]
    # where to store them
    store_spec: LoadStoreSpec
    # hashes of the blocks that were evicted as a result of the store
    block_hashes_evicted: list[BlockHash]
@dataclass
class OffloadingEvent:
    """Record of a group of blocks being stored to, or removed from, a medium."""
    # hashes of the blocks this event refers to
    block_hashes: list[BlockHash]
    # block size associated with the reporting medium
    block_size: int
    # name of the medium the blocks were stored to / removed from
    medium: str
    # True if blocks are removed, False if stored
    removed: bool
class OffloadingManager(ABC):
    """
    Interface for tracking offloaded KV blocks.

    lookup(), prepare_load() and prepare_store() must be implemented by
    subclasses. touch(), complete_load(), complete_store() and
    take_events() are optional hooks which do nothing by default.
    """

    @abstractmethod
    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
        """
        Count how many blocks, starting from the first given one,
        are consecutively offloaded.

        Args:
            block_hashes: the hashes identifying the blocks to lookup.

        Returns:
            The length of the maximal series of blocks, starting from
            the first one, that are all currently offloaded.
        """
        ...

    @abstractmethod
    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
        """
        Prepare the given blocks for reading.

        The blocks are protected from eviction until complete_load
        is called. All given blocks are assumed to be offloaded.

        Args:
            block_hashes: the hashes identifying the blocks.

        Returns:
            A LoadStoreSpec that a worker can use to locate and load
            the actual offloaded KV data.
        """
        ...

    def touch(self, block_hashes: Iterable[BlockHash]):
        """
        Mark the given blocks as recently used.

        In practice this could mean moving them to the end of an LRU list.

        Args:
            block_hashes: the hashes identifying the blocks.
        """
        return None

    def complete_load(self, block_hashes: Iterable[BlockHash]):
        """
        Mark blocks which were previously prepared for loading
        as done loading.

        Args:
            block_hashes: the hashes identifying the blocks.
        """
        return None

    @abstractmethod
    def prepare_store(
            self,
            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
        """
        Prepare the given blocks for offloading.

        The blocks are protected from eviction until complete_store
        is called.

        Args:
            block_hashes: the hashes identifying the blocks.

        Returns:
            A PrepareStoreOutput listing the blocks that need storing,
            where to store them (LoadStoreSpec), and the blocks that were
            evicted as a result.
            None if the blocks cannot be stored.
        """
        ...

    def complete_store(self,
                       block_hashes: Iterable[BlockHash],
                       success: bool = True):
        """
        Mark blocks which were previously prepared for storing as stored.

        After this call the blocks become loadable.
        If success is False, blocks that were not marked as stored
        will be removed.

        Args:
            block_hashes: the hashes identifying the blocks.
            success: whether the blocks were stored successfully.
        """
        return None

    def take_events(self) -> Iterable[OffloadingEvent]:
        """
        Take the offloading events from the manager.

        Yields:
            New OffloadingEvents collected since the last call.
        """
        return tuple()

View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ctypes
from abc import ABC, abstractmethod
from collections.abc import Iterable
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.kv_offload.abstract import LoadStoreSpec
class BlockStatus(ctypes.Structure):
    """
    Offloading status of a single block of KV data.

    Fields:
        ref_cnt - the current number of transfers using this block
            as a source. A value of -1 indicates the block is not yet
            ready to be read.

    Backends may subclass this structure to add the fields they need
    in order to locate the block.
    """
    _fields_ = [("ref_cnt", ctypes.c_int32)]

    def __init__(self):
        super().__init__()
        # a new block starts out as "not ready" (ref_cnt = -1)
        self.ref_cnt = -1

    @property
    def is_ready(self) -> bool:
        """
        Whether the block is ready to be read (ref_cnt is non-negative).
        """
        return not self.ref_cnt < 0
class Backend(ABC):
    """
    Abstract allocator of KV blocks on some backend, which also
    produces the specs needed for reading/writing those blocks.
    """

    def __init__(self, block_size: int, medium: str):
        self.medium = medium
        self.block_size = block_size

    @abstractmethod
    def get_num_free_blocks(self):
        """
        Return the number of blocks that can currently be allocated.
        """
        ...

    @abstractmethod
    def allocate_blocks(self,
                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
        """
        Allocate space for writing blocks.

        This method assumes there is enough space for the allocation.
        It is unsafe to call it without first checking
        get_num_free_blocks.

        Args:
            block_hashes: the hashes identifying the blocks to be written.

        Returns:
            A list of BlockStatus, one per allocated block.
            Each returned item has ref_cnt == -1, meaning the block
            is not yet ready to be read.
        """
        ...

    @abstractmethod
    def free(self, block: BlockStatus):
        """
        Free a previously allocated block.

        Only call this with blocks returned by allocate_blocks,
        and only once per block.

        Args:
            block: The block to be freed.
        """
        ...

    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
        """
        Get backend-specific information on how to read/write blocks.

        The base implementation is not usable and always raises.

        Args:
            block_hashes: the list of block hashes identifying the blocks.
            blocks: the list of blocks.

        Returns:
            A LoadStoreSpec that can be used by a worker
            to read/write the blocks.

        Raises:
            NotImplementedError: unless overridden by a subclass.
        """
        raise NotImplementedError

View File

View File

@@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ctypes
from collections.abc import Iterable
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.kv_offload.abstract import LoadStoreSpec
from vllm.v1.kv_offload.backend import Backend, BlockStatus
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
class CPUBlockStatus(BlockStatus):
    """BlockStatus extended with the ID of the block in the CPU backend."""
    _fields_ = [*BlockStatus._fields_,
                ("block_id", ctypes.c_int64)]  # type: ignore

    def __init__(self, block_id: int):
        super().__init__()
        self.block_id = block_id
class CPUBackend(Backend):
    """
    Backend which hands out CPU block IDs out of num_blocks total.

    Never-used IDs are handed out first, in increasing order starting
    from 0. IDs of freed blocks are kept in a free list, which is only
    consumed once an allocation cannot be satisfied by never-used IDs.
    """

    def __init__(self, block_size: int, num_blocks: int):
        super().__init__(block_size=block_size,
                         medium=CPULoadStoreSpec.medium())

        self.num_blocks: int = num_blocks
        # count of IDs handed out so far (also the next never-used ID)
        self.num_allocated_blocks: int = 0
        # IDs of blocks that were freed and may be handed out again
        self.allocated_blocks_free_list: list[int] = []

    def get_num_free_blocks(self):
        """Return the count of freed IDs plus the count of never-used IDs."""
        never_used = self.num_blocks - self.num_allocated_blocks
        return len(self.allocated_blocks_free_list) + never_used

    def allocate_blocks(self,
                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
        """
        Allocate one block per given hash.

        Assumes enough free blocks exist (see get_num_free_blocks).

        Args:
            block_hashes: the hashes identifying the blocks to be written.

        Returns:
            A list of CPUBlockStatus, never-used IDs first, followed by
            IDs popped from the end of the free list.
        """
        num_requested = len(block_hashes)
        num_fresh = min(num_requested,
                        self.num_blocks - self.num_allocated_blocks)
        num_reused = num_requested - num_fresh
        assert len(self.allocated_blocks_free_list) >= num_reused

        # never-used IDs: a consecutive run starting at the current counter
        first_fresh_id = self.num_allocated_blocks
        blocks: list[BlockStatus] = [
            CPUBlockStatus(block_id)
            for block_id in range(first_fresh_id, first_fresh_id + num_fresh)
        ]
        self.num_allocated_blocks += len(blocks)

        # the remainder comes from previously freed IDs
        blocks.extend(
            CPUBlockStatus(self.allocated_blocks_free_list.pop())
            for _ in range(num_reused))
        return blocks

    def free(self, block: BlockStatus):
        """Return the ID of the given block to the free list."""
        assert isinstance(block, CPUBlockStatus)
        self.allocated_blocks_free_list.append(block.block_id)

    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
        """
        Build a CPULoadStoreSpec out of the IDs of the given blocks.

        block_hashes is not used by this backend.
        """
        block_ids = [block.block_id for block in blocks]
        return CPULoadStoreSpec(block_ids)

75
vllm/v1/kv_offload/cpu.py Normal file
View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterator
from typing import Optional
import torch
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.platforms import current_platform
from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
from vllm.v1.kv_offload.backends.cpu import CPUBackend
from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.spec import OffloadingSpec
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
class CPUOffloadingSpec(OffloadingSpec):
    """
    OffloadingSpec for offloading between GPU and CPU.

    Reads "num_cpu_blocks" from the connector extra config, and lazily
    creates a single manager (scheduler-side) and a single handler
    (worker-side).
    """

    def __init__(self, vllm_config: VllmConfig):
        super().__init__(vllm_config)

        configured_blocks = self.extra_config.get("num_cpu_blocks")
        if not configured_blocks:
            raise Exception("num_cpu_blocks must be specified "
                            "in kv_connector_extra_config")
        self.num_cpu_blocks: int = configured_blocks

        # scheduler-side
        self._manager: Optional[OffloadingManager] = None

        # worker-side
        self._handler: Optional[OffloadingHandler] = None

    def get_manager(self) -> OffloadingManager:
        """
        Return the manager, creating it on first use.

        The manager is an LRUOffloadingManager over a CPUBackend. Events
        are enabled only if a kv_events_config exists and has
        enable_kv_cache_events set.
        """
        if self._manager is not None:
            return self._manager

        events_cfg = self.vllm_config.kv_events_config
        enable_events = (events_cfg is not None
                         and events_cfg.enable_kv_cache_events)
        backend = CPUBackend(block_size=self.offloaded_block_size,
                             num_blocks=self.num_cpu_blocks)
        self._manager = LRUOffloadingManager(backend,
                                             enable_events=enable_events)
        return self._manager

    def get_handlers(
        self, kv_caches: dict[str, torch.Tensor]
    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
                        OffloadingHandler]]:
        """
        Yield the same handler for both transfer directions.

        The handler is created on first use (i.e. once the returned
        generator is first advanced).

        Args:
            kv_caches: A dictionary of layer_name -> gpu_kv_cache tensor.

        Yields:
            (GPULoadStoreSpec, CPULoadStoreSpec, handler) followed by
            (CPULoadStoreSpec, GPULoadStoreSpec, handler).

        Raises:
            Exception: if the current platform is not CUDA.
        """
        if self._handler is None:
            if not current_platform.is_cuda():
                raise Exception("CPU Offloading is currently only supported"
                                " on CUDA GPUs")

            names = list(kv_caches.keys())
            layers = get_layers_from_vllm_config(self.vllm_config,
                                                 AttentionLayerBase, names)
            # layer_name -> attention backend of that layer
            attn_backends = {}
            for name in names:
                attn_backends[name] = layers[name].get_attn_backend()

            self._handler = CpuGpuOffloadingHandler(
                attn_backends=attn_backends,
                gpu_block_size=self.gpu_block_size,
                cpu_block_size=self.offloaded_block_size,
                num_cpu_blocks=self.num_cpu_blocks,
                gpu_caches=kv_caches)

        assert self._handler is not None

        handler = self._handler
        yield GPULoadStoreSpec, CPULoadStoreSpec, handler
        yield CPULoadStoreSpec, GPULoadStoreSpec, handler

View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
from typing import TYPE_CHECKING, Callable
from vllm.logger import init_logger
from vllm.v1.kv_offload.spec import OffloadingSpec
if TYPE_CHECKING:
from vllm.config import VllmConfig
logger = init_logger(__name__)
class OffloadingSpecFactory:
    """
    Registry and factory of OffloadingSpec classes.

    Specs are registered by name together with the module path and class
    name to import. The import is deferred until the spec is created.
    """
    # spec name -> zero-argument loader returning the spec class
    _registry: dict[str, Callable[[], type[OffloadingSpec]]] = {}

    @classmethod
    def register_spec(cls, name: str, module_path: str,
                      class_name: str) -> None:
        """
        Register a spec with a lazy-loading module and class name.

        Args:
            name: the name under which the spec is registered.
            module_path: the module to import when the spec is needed.
            class_name: the attribute of that module holding the class.

        Raises:
            ValueError: if a spec is already registered under this name.
        """
        if name in cls._registry:
            raise ValueError(f"Spec '{name}' is already registered.")

        def loader() -> type[OffloadingSpec]:
            module = importlib.import_module(module_path)
            return getattr(module, class_name)

        cls._registry[name] = loader

    @classmethod
    def create_spec(
        cls,
        config: "VllmConfig",
    ) -> OffloadingSpec:
        """
        Create the spec selected by the connector extra config.

        The spec name is read from the "spec_name" key (defaulting to
        "CPUOffloadingSpec"). A name which is not registered is imported
        from the module given by the "spec_module_path" key.

        Args:
            config: the vLLM config; its kv_transfer_config must be set.

        Returns:
            An instance of the selected spec class, built from config.

        Raises:
            ValueError: if the name is not registered and no
                "spec_module_path" is configured.
        """
        kv_transfer_config = config.kv_transfer_config
        assert kv_transfer_config is not None
        extra_config = kv_transfer_config.kv_connector_extra_config
        spec_name = extra_config.get("spec_name", "CPUOffloadingSpec")
        if spec_name in cls._registry:
            spec_cls = cls._registry[spec_name]()
        else:
            # not a registered spec: fall back to a user-provided module
            spec_module_path = extra_config.get("spec_module_path")
            if spec_module_path is None:
                raise ValueError(f"Unsupported spec type: {spec_name}")
            spec_module = importlib.import_module(spec_module_path)
            spec_cls = getattr(spec_module, spec_name)
        assert issubclass(spec_cls, OffloadingSpec)
        logger.info("Creating offloading spec with name: %s", spec_name)
        return spec_cls(config)
# Register various specs here.
# Only the module path and class name are recorded at this point;
# the module itself is imported when the spec is created.
OffloadingSpecFactory.register_spec("CPUOffloadingSpec",
                                    "vllm.v1.kv_offload.cpu",
                                    "CPUOffloadingSpec")

View File

@@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections import OrderedDict
from collections.abc import Iterable
from typing import Optional
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
OffloadingManager, PrepareStoreOutput)
from vllm.v1.kv_offload.backend import Backend, BlockStatus
class LRUOffloadingManager(OffloadingManager):
    """
    An OffloadingManager with a pluggable backend, which evicts blocks by LRU.

    Blocks are tracked in an OrderedDict. Eviction scans it from the
    front, and touch() moves blocks to the end, so the front holds the
    least recently used blocks.
    """

    def __init__(self, backend: Backend, enable_events: bool = False):
        self.backend: Backend = backend
        # block_hash -> BlockStatus
        self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
        # pending events, or None when events are disabled
        self.events: Optional[list[OffloadingEvent]] = \
            [] if enable_events else None

    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
        """
        Count the leading blocks that are tracked and ready to be read.

        Args:
            block_hashes: the hashes identifying the blocks to lookup.

        Returns:
            The number of consecutive blocks, starting from the first,
            that are offloaded and ready.
        """
        hit_count = 0
        for block_hash in block_hashes:
            block = self.blocks.get(block_hash)
            if block is None or not block.is_ready:
                break
            hit_count += 1
        return hit_count

    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
        """
        Take a reference on each given block and build its load spec.

        A block with a positive ref_cnt is skipped by eviction, so the
        blocks stay protected until complete_load is called.
        All given blocks must be tracked and ready.

        Args:
            block_hashes: the hashes identifying the blocks.

        Returns:
            The LoadStoreSpec produced by the backend for these blocks.
        """
        # The hashes are needed twice (here and by the backend), so
        # materialize them in case a one-shot iterable was given.
        block_hashes = list(block_hashes)
        blocks = []
        for block_hash in block_hashes:
            block = self.blocks[block_hash]
            assert block.is_ready
            block.ref_cnt += 1
            blocks.append(block)
        return self.backend.get_load_store_spec(block_hashes, blocks)

    def touch(self, block_hashes: Iterable[BlockHash]):
        """
        Move the given blocks to the most-recently-used end.

        Untracked hashes are ignored.

        Args:
            block_hashes: the hashes identifying the blocks.
        """
        # Iterate in reverse so that the first given block is moved last,
        # and thus ends up as the most recently used one.
        for block_hash in reversed(list(block_hashes)):
            if block_hash in self.blocks:
                self.blocks.move_to_end(block_hash)

    def complete_load(self, block_hashes: Iterable[BlockHash]):
        """
        Drop the reference taken by prepare_load on each given block.

        Args:
            block_hashes: the hashes identifying the blocks.
        """
        for block_hash in block_hashes:
            block = self.blocks[block_hash]
            assert block.ref_cnt > 0
            block.ref_cnt -= 1

    def prepare_store(
            self,
            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
        """
        Allocate backend blocks for the given hashes, evicting if needed.

        Hashes which are already tracked are skipped. If the backend lacks
        free blocks, unreferenced blocks (ref_cnt == 0) are evicted in LRU
        order. Newly allocated blocks are tracked as "not ready" until
        complete_store is called.

        Args:
            block_hashes: the hashes identifying the blocks.

        Returns:
            A PrepareStoreOutput with the hashes to store, their store
            spec, and the evicted hashes.
            None if not enough blocks could be evicted. In that case
            nothing is evicted or allocated.
        """
        # Filter out blocks that are already stored. Repeated hashes are
        # collapsed as well (keeping first-seen order): allocating two
        # blocks for one hash would leak the first allocation.
        block_hashes_to_store = [
            block_hash for block_hash in dict.fromkeys(block_hashes)
            if block_hash not in self.blocks
        ]

        num_blocks_to_evict = (len(block_hashes_to_store) -
                               self.backend.get_num_free_blocks())

        # build list of blocks to evict
        to_evict = []
        if num_blocks_to_evict > 0:
            for block_hash, block in self.blocks.items():
                # only ready blocks with no in-flight loads are evictable
                if block.ref_cnt == 0:
                    to_evict.append(block_hash)
                    num_blocks_to_evict -= 1
                    if num_blocks_to_evict == 0:
                        break
            else:
                # we could not evict enough blocks
                return None

        # evict blocks
        for block_hash in to_evict:
            self.backend.free(self.blocks.pop(block_hash))

        if to_evict and self.events is not None:
            self.events.append(
                OffloadingEvent(block_hashes=to_evict,
                                block_size=self.backend.block_size,
                                medium=self.backend.medium,
                                removed=True))

        blocks = self.backend.allocate_blocks(block_hashes_to_store)
        assert len(blocks) == len(block_hashes_to_store)

        for block_hash, block in zip(block_hashes_to_store, blocks):
            self.blocks[block_hash] = block

        # build store specs for allocated blocks
        store_spec = self.backend.get_load_store_spec(block_hashes_to_store,
                                                      blocks)

        return PrepareStoreOutput(block_hashes_to_store=block_hashes_to_store,
                                  store_spec=store_spec,
                                  block_hashes_evicted=to_evict)

    def complete_store(self,
                       block_hashes: Iterable[BlockHash],
                       success: bool = True):
        """
        Finish a store which was started by prepare_store.

        On success, blocks which are not yet ready become ready
        (ref_cnt = 0), and a "stored" event is recorded for them if events
        are enabled. On failure, blocks which are not yet ready are freed
        and no longer tracked. Blocks which are already ready are left
        untouched in both cases.

        Args:
            block_hashes: the hashes identifying the blocks.
            success: whether the blocks were stored successfully.
        """
        stored_block_hashes: list[BlockHash] = []
        if success:
            for block_hash in block_hashes:
                block = self.blocks[block_hash]
                if not block.is_ready:
                    block.ref_cnt = 0
                    stored_block_hashes.append(block_hash)
        else:
            for block_hash in block_hashes:
                block = self.blocks[block_hash]
                if not block.is_ready:
                    self.backend.free(block)
                    del self.blocks[block_hash]

        if stored_block_hashes and self.events is not None:
            self.events.append(
                OffloadingEvent(block_hashes=stored_block_hashes,
                                block_size=self.backend.block_size,
                                medium=self.backend.medium,
                                removed=False))

    def take_events(self) -> Iterable[OffloadingEvent]:
        """
        Take the offloading events from the manager.

        This is a generator: the pending events are cleared only once
        the returned generator has been fully consumed.

        Yields:
            New OffloadingEvents collected since the last call.
            Nothing is yielded when events are disabled.
        """
        if self.events is not None:
            yield from self.events
            self.events.clear()

View File

@@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC
import numpy as np
from vllm.v1.kv_offload.abstract import LoadStoreSpec
class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
    """
    Spec for loading/storing KV blocks from given block numbers.

    The block numbers are kept as a numpy int64 array in block_ids.
    """

    def __init__(self, block_ids: list[int]):
        # np.array copies the given list into a new int64 array
        self.block_ids = np.array(block_ids, dtype=np.int64)

    def __repr__(self) -> str:
        # the representation is that of the underlying numpy array
        return repr(self.block_ids)
class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
    """
    Spec for loading/storing a KV block to GPU memory.
    """

    @staticmethod
    def medium() -> str:
        """Return the medium name of this spec, "GPU"."""
        return "GPU"
class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
    """
    Spec for loading/storing a KV block to CPU memory.
    """

    @staticmethod
    def medium() -> str:
        """Return the medium name of this spec, "CPU"."""
        return "CPU"

View File

@@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from collections.abc import Iterator
from typing import TYPE_CHECKING
import torch
from vllm.logger import init_logger
from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
if TYPE_CHECKING:
from vllm.config import VllmConfig
logger = init_logger(__name__)
class OffloadingSpec(ABC):
    """Spec for an offloading connector"""

    def __init__(self, vllm_config: "VllmConfig"):
        """
        Read the block sizes and the connector extra config.

        The offloaded block size is taken from the "block_size" key of
        the extra config, defaulting to the GPU block size. It must be a
        multiple of the GPU block size.

        Args:
            vllm_config: the vLLM config; its kv_transfer_config
                must be set.
        """
        logger.warning(
            "Initializing OffloadingSpec. This API is experimental and "
            "subject to change in the future as we iterate the design.")
        self.vllm_config = vllm_config

        transfer_cfg = vllm_config.kv_transfer_config
        assert transfer_cfg is not None
        self.extra_config = transfer_cfg.kv_connector_extra_config

        gpu_block_size = vllm_config.cache_config.block_size
        self.gpu_block_size = gpu_block_size

        configured_size = self.extra_config.get("block_size", gpu_block_size)
        self.offloaded_block_size = int(configured_size)

        assert self.offloaded_block_size % self.gpu_block_size == 0

    @abstractmethod
    def get_manager(self) -> OffloadingManager:
        """
        Get the OffloadingManager used by the scheduler-side
        offloading connector to track offloaded blocks
        and manage evictions.
        """
        ...

    @abstractmethod
    def get_handlers(
        self, kv_caches: dict[str, torch.Tensor]
    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
                        OffloadingHandler]]:
        """
        Get offloading handlers together with their source and
        destination spec types.

        Args:
            kv_caches: A dictionary of layer_name -> gpu_kv_cache tensor.

        Yields:
            Tuples of (src_type, dst_type, offloading_handler).
        """
        ...

View File

View File

@@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import torch
from vllm import _custom_ops as ops
from vllm.attention import AttentionBackend
from vllm.logger import init_logger
from vllm.utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
TransferResult, TransferSpec)
logger = init_logger(__name__)
def expand_block_ids(block_ids: np.ndarray,
                     block_size_factor: int,
                     output: np.ndarray,
                     skip_count: int = 0):
    """
    Expand block IDs into the IDs of the sub-blocks they are made of,
    where each block consists of block_size_factor consecutive sub-blocks.

    The result is written in place to the leading entries of output.
    The first skip_count sub-blocks of the first block are skipped.
    Note that skip_count must be less than block_size_factor.

    For example, if block_ids = [0, 1, 3] and block_size_factor = 4,
    then output receives [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15]
    since 0 maps to [0, 1, 2, 3]
    1 maps to [4, 5, 6, 7]
    and 3 maps to [12, 13, 14, 15]
    """
    assert skip_count < block_size_factor

    # nothing to write for an empty input
    if len(block_ids) == 0:
        return

    # ID of the first sub-block of every block
    base_ids = np.asarray(block_ids) * block_size_factor

    # the first block is possibly partial (skip_count sub-blocks dropped)
    head = base_ids[0] + np.arange(skip_count, block_size_factor)
    # all other blocks are expanded in full, using broadcasting
    tail = (base_ids[1:, np.newaxis] + np.arange(block_size_factor)).ravel()

    head_end = head.size
    output[:head_end] = head
    output[head_end:head_end + tail.size] = tail
class CpuGpuOffloadingHandler(OffloadingHandler):
    """
    OffloadingHandler copying KV blocks between GPU and CPU tensors.

    One CPU tensor is allocated per GPU KV cache tensor. Transfers are
    submitted on a dedicated CUDA stream per direction, and completion
    is detected using CUDA events.
    """

    def __init__(self, gpu_block_size: int, cpu_block_size: int,
                 num_cpu_blocks: int, gpu_caches: dict[str, torch.Tensor],
                 attn_backends: dict[str, type[AttentionBackend]]):
        assert cpu_block_size % gpu_block_size == 0
        # number of GPU-sized blocks making up a single CPU block
        self.block_size_factor = cpu_block_size // gpu_block_size

        # cuda streams for gpu->cpu and cpu->gpu
        self.d2h_stream = torch.cuda.Stream()
        self.h2d_stream = torch.cuda.Stream()

        # job_id -> transfer cuda event
        self.transfer_events: dict[int, torch.cuda.Event] = {}
        # list of cuda events available for re-use
        self.events_pool: list[torch.cuda.Event] = []

        pin_memory = is_pin_memory_available()

        # allocate cpu tensors
        logger.info("Allocating %d CPU tensors...", len(gpu_caches))
        self.gpu_tensors: list[torch.Tensor] = []
        self.cpu_tensors: list[torch.Tensor] = []
        # per tensor: True if the shape is (2, num_blocks, ...)
        self.kv_dim_before_num_blocks: list[bool] = []
        for layer_name, gpu_tensor in gpu_caches.items():
            self.gpu_tensors.append(gpu_tensor)

            gpu_shape = gpu_tensor.shape
            # Ask the attention backend for the shape of a cache with a
            # recognizable num_blocks, to learn which dim holds num_blocks.
            test_shape = attn_backends[layer_name].get_kv_cache_shape(
                num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256)
            kv_dim_first = test_shape[0] != 1234
            if kv_dim_first:
                # shape should be (2, num_blocks, ...)
                assert test_shape[0] == 2
                assert test_shape[1] == 1234
                assert gpu_shape[0] == 2
            # otherwise shape is (num_blocks, ...)
            num_blocks_idx = 1 if kv_dim_first else 0
            self.kv_dim_before_num_blocks.append(kv_dim_first)

            # same shape as the GPU tensor, except for the number of blocks
            cpu_shape = list(gpu_shape)
            cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor

            logger.debug("Allocating CPU tensor of shape %r", cpu_shape)
            cpu_tensor = torch.zeros(cpu_shape,
                                     dtype=gpu_tensor.dtype,
                                     device="cpu",
                                     pin_memory=pin_memory)
            self.cpu_tensors.append(cpu_tensor)

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        """
        Submit a CPU->GPU or GPU->CPU transfer on the matching stream.

        Args:
            job_id: a unique ID that will be reported back by
                get_finished once the transfer completes.
            spec: the (src, dst) spec of the KV data transfer.

        Returns:
            True, as the transfer is always submitted.
        """
        src_spec, dst_spec = spec
        if isinstance(src_spec, CPULoadStoreSpec):
            # cpu -> gpu
            assert isinstance(dst_spec, GPULoadStoreSpec)
            stream = self.h2d_stream
            src_tensors, dst_tensors = self.cpu_tensors, self.gpu_tensors
            src_block_size_factor = self.block_size_factor
            dst_block_size_factor = 1
        else:
            # gpu -> cpu
            assert isinstance(src_spec, GPULoadStoreSpec)
            assert isinstance(dst_spec, CPULoadStoreSpec)
            stream = self.d2h_stream
            src_tensors, dst_tensors = self.gpu_tensors, self.cpu_tensors
            src_block_size_factor = 1
            dst_block_size_factor = self.block_size_factor

        src_blocks = src_spec.block_ids
        dst_blocks = dst_spec.block_ids
        assert src_blocks.ndim == 1
        assert dst_blocks.ndim == 1

        # If the source does not fill a whole number of destination
        # blocks, the leading sub-blocks of the first destination block
        # are skipped.
        dst_sub_blocks_to_skip = (-src_blocks.size % dst_block_size_factor)
        src_sub_block_count = src_blocks.size * src_block_size_factor
        dst_sub_block_count = (dst_blocks.size * dst_block_size_factor -
                               dst_sub_blocks_to_skip)
        assert src_sub_block_count == dst_sub_block_count

        # column 0: source sub-block IDs, column 1: destination sub-block IDs
        src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64)
        expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0])
        expand_block_ids(dst_blocks,
                         dst_block_size_factor,
                         src_to_dst[:, 1],
                         skip_count=dst_sub_blocks_to_skip)
        src_to_dst_tensor = torch.from_numpy(src_to_dst)

        # re-use a pooled event if one is available
        if self.events_pool:
            event = self.events_pool.pop()
        else:
            event = torch.cuda.Event()

        with torch.cuda.stream(stream):
            for src_tensor, dst_tensor, kv_dim in zip(
                    src_tensors, dst_tensors, self.kv_dim_before_num_blocks):
                if not kv_dim:
                    ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor)
                    continue
                # index 0 is the key cache, index 1 is the value cache
                for kv_idx in (0, 1):
                    ops.swap_blocks(src_tensor[kv_idx], dst_tensor[kv_idx],
                                    src_to_dst_tensor)
            event.record(stream)

        self.transfer_events[job_id] = event

        # success
        return True

    def get_finished(self) -> list[TransferResult]:
        """
        Collect the transfers whose CUDA event has completed.

        Completed jobs are forgotten, and their events are returned
        to the pool for re-use.

        Returns:
            A list of (job_id, True), one per completed transfer.
        """
        finished_job_ids = [
            job_id for job_id, event in self.transfer_events.items()
            if event.query()
        ]
        for job_id in finished_job_ids:
            self.events_pool.append(self.transfer_events.pop(job_id))
        return [(job_id, True) for job_id in finished_job_ids]

View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from vllm.logger import init_logger
from vllm.v1.kv_offload.abstract import LoadStoreSpec
# a single transfer spec (src_blocks_spec, dst_blocks_spec)
TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec]
# transfers are forwarded to workers by (src_medium, dst_medium),
# where each medium is the string returned by LoadStoreSpec.medium()
TransferType = tuple[str, str]
# transfer result (job_id, success)
TransferResult = tuple[int, bool]
# module-level logger
logger = init_logger(__name__)
class OffloadingHandler(ABC):
    """
    Interface for managing asynchronous KV data transfers.

    This class runs in the worker.
    It kicks off async KV data transfer requests, and allows
    collecting back completion statuses.

    The class provides the following primitives:
        transfer_async() - kicks off a new transfer job
        get_finished() - returns a list of newly finished job IDs.
    """

    @abstractmethod
    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        """
        Start an asynchronous transfer of KV data.

        Args:
            job_id: a unique ID that will be used when notifying back on
                transfer completion.
            spec: the (src, dst) spec of the KV data transfer.

        Returns:
            True if transfer was submitted successfully.
        """
        ...

    @abstractmethod
    def get_finished(self) -> list[TransferResult]:
        """
        Get the transfers which finished since the last call.

        Returns:
            A list of (job_id, success) of transfers.
        """
        ...
class OffloadingWorker:
    """
    Dispatcher of asynchronous KV data transfers over
    multiple OffloadingHandlers.

    This class runs in the worker.
    Each transfer request is delegated to one of the registered
    OffloadingHandlers, selected by the transfer type, i.e. the
    (source medium, destination medium) pair.

    The class provides the following primitives:
        register_handler() - registers a new handler to handle
            a specific transfer type
        transfer_async() - kicks off a new transfer job
            using one of the registered handlers.
        get_finished() - returns a list of newly finished job IDs
            from all handlers.
    """

    def __init__(self):
        # distinct handlers (one handler may serve several transfer types)
        self.handlers: set[OffloadingHandler] = set()
        self.transfer_type_to_handler: dict[TransferType,
                                            OffloadingHandler] = {}

    def register_handler(self, src_cls: type[LoadStoreSpec],
                         dst_cls: type[LoadStoreSpec],
                         handler: OffloadingHandler) -> None:
        """
        Register a handler for a transfer type.

        A transfer type may only be registered once.

        Args:
            src_cls: the source type of transfers handled by this handler.
            dst_cls: the destination type of transfers handled by this handler.
            handler: the handler that will handle transfers.
        """
        key = (src_cls.medium(), dst_cls.medium())
        assert key not in self.transfer_type_to_handler
        self.handlers.add(handler)
        self.transfer_type_to_handler[key] = handler

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        """
        Start an asynchronous transfer of KV data.

        An exception raised by the handler is logged and reported as
        a failed submission.

        Args:
            job_id: a unique ID that will be used when notifying back on
                transfer completion.
            spec: the (src, dst) spec of the KV data transfer.

        Returns:
            True if transfer was submitted successfully.
        """
        src, dst = spec
        transfer_type = (src.medium(), dst.medium())
        handler = self.transfer_type_to_handler.get(transfer_type)
        assert handler is not None

        try:
            success = handler.transfer_async(job_id, spec)
        except Exception as e:
            logger.warning("Exception in %r transfer %d: %r",
                           transfer_type,
                           job_id,
                           e,
                           exc_info=True)
            return False

        if success:
            logger.debug("Submitted %r transfer %d: %r", transfer_type, job_id,
                         spec)
        else:
            logger.warning("Failed to submit %r transfer %d", transfer_type,
                           job_id)
        return success

    def get_finished(self) -> list[TransferResult]:
        """
        Get the transfers which finished since the last call,
        gathered from all registered handlers.

        Returns:
            A list of (job_id, success) of transfers.
        """
        return [
            result for handler in self.handlers
            for result in handler.get_finished()
        ]