init

2026-04-02 04:53:13 +00:00
parent 80932c96e5
commit 24df76db9d
1987 changed files with 447445 additions and 0 deletions
--- a/vllm/v1/kv_offload/init.py
+++ b/vllm/v1/kv_offload/init.py
--- a/vllm/v1/kv_offload/abstract.py
+++ b/vllm/v1/kv_offload/abstract.py
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+OffloadingManager class for managing KV data offloading in vLLM v1
+
+This class runs in the scheduler, tracks which blocks are offloaded
+and their address.
+
+The class provides the following primitives:
+    lookup() - find the length of the maximal series of blocks,
+        starting from the first one, that are all offloaded.
+    prepare_load() - prepare given blocks to be read.
+        The given blocks will be protected from eviction.
+        This function returns a LoadSpec which encapsulates
+        information required for performing the load.
+    touch() - marks the give blocks as recently used. Can be used
+        to track block's LRU. This function is separated from the
+        prepare_load function to allow setting block recency even
+        for blocks which do not need reading from the cache, such as
+        blocks that are cached by the GPU prefix cache.
+    complete_load() - mark blocks which were previously prepared to be
+        loaded as done loading. This is to re-allow their eviction.
+    prepare_store() - prepare the given blocks to be written.
+        Returns a StoreSpec encapsulating offloading information,
+        as well as a list of blocks that were evicted as a result.
+    complete_store() - marks a previous store as completed.
+        Following this call, the given blocks will become loadable.
+"""
+
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Optional
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+
+
+class LoadStoreSpec(ABC):
+    """
+    Abstract metadata that encapsulates information allowing a worker
+    to load, and optionally also to store, blocks of KV data.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def medium() -> str:
+        """
+        Returns a string representation of the medium type
+        this store/load targets.
+        """
+        pass
+
+
+@dataclass
+class PrepareStoreOutput:
+    block_hashes_to_store: list[BlockHash]
+    store_spec: LoadStoreSpec
+    block_hashes_evicted: list[BlockHash]
+
+
+@dataclass
+class OffloadingEvent:
+    block_hashes: list[BlockHash]
+    block_size: int
+    medium: str
+    # True if blocks are removed, False if stored
+    removed: bool
+
+
+class OffloadingManager(ABC):
+
+    @abstractmethod
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        """
+        Finds the length of the maximal series of blocks, starting from the
+        first one, that are all offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks to lookup.
+
+        Returns:
+            An integer representing the maximal number of blocks that
+            are currently offloaded.
+        """
+        pass
+
+    @abstractmethod
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        """
+        Prepare the given blocks to be read.
+        The given blocks will be protected from eviction until
+        complete_load is called.
+        It assumes all given blocks are offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A LoadStoreSpec that can be used by a worker to locate and load
+            the actual offloaded KV data.
+        """
+        pass
+
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        """
+        Mark the given blocks as recently used.
+        This could in practice mean moving them to the end of an LRU list.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        return
+
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        """
+        Marks previous blocks that were prepared to load as done loading.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        return
+
+    @abstractmethod
+    def prepare_store(
+            self,
+            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
+        """
+        Prepare the given blocks to be offloaded.
+        The given blocks will be protected from eviction until
+        complete_store is called.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A PrepareStoreOutput indicating which blocks need storing,
+            where to store them (LoadStoreSpec), and list of blocks that
+            were evicted as a result.
+            None is returned if the blocks cannot be stored.
+        """
+        pass
+
+    def complete_store(self,
+                       block_hashes: Iterable[BlockHash],
+                       success: bool = True):
+        """
+        Marks blocks which were previously prepared to be stored, as stored.
+        Following this call, the blocks become loadable.
+        If if_success is False, blocks that were not marked as stored will be
+        removed.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+            success: whether the blocks were stored successfully.
+        """
+        return
+
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        """
+        Take the offloading events from the manager.
+
+        Yields:
+            New OffloadingEvents collected since the last call.
+        """
+        return ()
--- a/vllm/v1/kv_offload/backend.py
+++ b/vllm/v1/kv_offload/backend.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ctypes
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+
+
+class BlockStatus(ctypes.Structure):
+    """
+    Offloading status for a single block of KV data.
+    Holds the following information:
+
+    ref_cnt - the current number of transfers using this block as a source.
+        A value of -1 indicates the block is not yet ready to be read.
+    load_store_spec - backend-specific information on how to actually
+        read/write the block.
+    """
+    _fields_ = [("ref_cnt", ctypes.c_int32)]
+
+    def __init__(self):
+        super().__init__()
+        # initialize block as "not ready" (ref_cnt = -1)
+        self.ref_cnt = -1
+
+    @property
+    def is_ready(self) -> bool:
+        """
+        Returns whether the block is ready to be read.
+        """
+        return self.ref_cnt >= 0
+
+
+class Backend(ABC):
+    """
+    An abstract class for allocating and returning specs for writing
+    KV blocks to some backend.
+    """
+
+    def __init__(self, block_size: int, medium: str):
+        self.block_size = block_size
+        self.medium = medium
+
+    @abstractmethod
+    def get_num_free_blocks(self):
+        """
+        Returns the number of current number of blocks that can be allocated.
+        """
+        pass
+
+    @abstractmethod
+    def allocate_blocks(self,
+                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
+        """
+        Allocate space for writing blocks.
+        This method assumes there is enough space for allocation.
+        It is unsafe to use without checking get_num_free_blocks beforehand.
+
+        Args:
+            block_hashes: the hashes identifying the blocks to be written.
+
+        Returns:
+            A list of BlockStatus for the allocated blocks.
+            The ref_cnt of each returned item will be -1, meaning the block
+            is not yet ready to be read.
+        """
+        pass
+
+    @abstractmethod
+    def free(self, block: BlockStatus):
+        """
+        Free a previously allocated block.
+        You should only call this function with blocks returned by
+        allocate_blocks, and only once per each block.
+
+        Args:
+            block: The block to be freed.
+        """
+        pass
+
+    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
+                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
+        """
+        Get backend-specific information on how to read/write blocks.
+
+        Args:
+            block_hashes: the list of block hashes identifying the blocks.
+            blocks: the list of blocks.
+
+        Returns:
+            A LoadStoreSpec that can be used by a worker
+            to read/write the blocks.
+        """
+        raise NotImplementedError
--- a/vllm/v1/kv_offload/backends/init.py
+++ b/vllm/v1/kv_offload/backends/init.py
--- a/vllm/v1/kv_offload/backends/cpu.py
+++ b/vllm/v1/kv_offload/backends/cpu.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ctypes
+from collections.abc import Iterable
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
+
+
+class CPUBlockStatus(BlockStatus):
+    _fields_ = BlockStatus._fields_ + [("block_id", ctypes.c_int64)
+                                       ]  # type: ignore
+
+    def __init__(self, block_id: int):
+        super().__init__()
+        self.block_id = block_id
+
+
+class CPUBackend(Backend):
+
+    def __init__(self, block_size: int, num_blocks: int):
+        super().__init__(block_size=block_size,
+                         medium=CPULoadStoreSpec.medium())
+
+        self.num_blocks: int = num_blocks
+        self.num_allocated_blocks: int = 0
+        self.allocated_blocks_free_list: list[int] = []
+
+    def get_num_free_blocks(self):
+        return (len(self.allocated_blocks_free_list) + self.num_blocks -
+                self.num_allocated_blocks)
+
+    def allocate_blocks(self,
+                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
+        num_fresh_blocks = min(len(block_hashes),
+                               self.num_blocks - self.num_allocated_blocks)
+        num_reused_blocks = len(block_hashes) - num_fresh_blocks
+        assert len(self.allocated_blocks_free_list) >= num_reused_blocks
+
+        # allocate fresh blocks
+        blocks: list[BlockStatus] = []
+        for _ in range(num_fresh_blocks):
+            blocks.append(CPUBlockStatus(self.num_allocated_blocks))
+            self.num_allocated_blocks += 1
+
+        # allocate reused blocks
+        for _ in range(num_reused_blocks):
+            block_id = self.allocated_blocks_free_list.pop()
+            blocks.append(CPUBlockStatus(block_id))
+
+        return blocks
+
+    def free(self, block: BlockStatus):
+        assert isinstance(block, CPUBlockStatus)
+        self.allocated_blocks_free_list.append(block.block_id)
+
+    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
+                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
+        return CPULoadStoreSpec([block.block_id for block in blocks])
--- a/vllm/v1/kv_offload/cpu.py
+++ b/vllm/v1/kv_offload/cpu.py
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterator
+from typing import Optional
+
+import torch
+
+from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.platforms import current_platform
+from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
+from vllm.v1.kv_offload.backends.cpu import CPUBackend
+from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
+from vllm.v1.kv_offload.spec import OffloadingSpec
+from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
+from vllm.v1.kv_offload.worker.worker import OffloadingHandler
+
+
+class CPUOffloadingSpec(OffloadingSpec):
+
+    def __init__(self, vllm_config: VllmConfig):
+        super().__init__(vllm_config)
+
+        num_cpu_blocks = self.extra_config.get("num_cpu_blocks")
+        if not num_cpu_blocks:
+            raise Exception("num_cpu_blocks must be specified "
+                            "in kv_connector_extra_config")
+        self.num_cpu_blocks: int = num_cpu_blocks
+
+        # scheduler-side
+        self._manager: Optional[OffloadingManager] = None
+
+        # worker-side
+        self._handler: Optional[OffloadingHandler] = None
+
+    def get_manager(self) -> OffloadingManager:
+        if not self._manager:
+            kv_events_config = self.vllm_config.kv_events_config
+            enable_events = (kv_events_config is not None
+                             and kv_events_config.enable_kv_cache_events)
+            self._manager = LRUOffloadingManager(CPUBackend(
+                block_size=self.offloaded_block_size,
+                num_blocks=self.num_cpu_blocks),
+                                                 enable_events=enable_events)
+        return self._manager
+
+    def get_handlers(
+        self, kv_caches: dict[str, torch.Tensor]
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
+                        OffloadingHandler]]:
+        if not self._handler:
+            if not current_platform.is_cuda():
+                raise Exception("CPU Offloading is currently only supported"
+                                " on CUDA GPUs")
+
+            layer_names = list(kv_caches.keys())
+            layers = get_layers_from_vllm_config(self.vllm_config,
+                                                 AttentionLayerBase,
+                                                 layer_names)
+            attn_backends = {
+                layer_name: layers[layer_name].get_attn_backend()
+                for layer_name in layer_names
+            }
+
+            self._handler = CpuGpuOffloadingHandler(
+                attn_backends=attn_backends,
+                gpu_block_size=self.gpu_block_size,
+                cpu_block_size=self.offloaded_block_size,
+                num_cpu_blocks=self.num_cpu_blocks,
+                gpu_caches=kv_caches)
+
+        assert self._handler is not None
+        yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
+        yield CPULoadStoreSpec, GPULoadStoreSpec, self._handler
--- a/vllm/v1/kv_offload/factory.py
+++ b/vllm/v1/kv_offload/factory.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib
+from typing import TYPE_CHECKING, Callable
+
+from vllm.logger import init_logger
+from vllm.v1.kv_offload.spec import OffloadingSpec
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+logger = init_logger(__name__)
+
+
+class OffloadingSpecFactory:
+    _registry: dict[str, Callable[[], type[OffloadingSpec]]] = {}
+
+    @classmethod
+    def register_spec(cls, name: str, module_path: str,
+                      class_name: str) -> None:
+        """Register a spec with a lazy-loading module and class name."""
+        if name in cls._registry:
+            raise ValueError(f"Connector '{name}' is already registered.")
+
+        def loader() -> type[OffloadingSpec]:
+            module = importlib.import_module(module_path)
+            return getattr(module, class_name)
+
+        cls._registry[name] = loader
+
+    @classmethod
+    def create_spec(
+        cls,
+        config: "VllmConfig",
+    ) -> OffloadingSpec:
+        kv_transfer_config = config.kv_transfer_config
+        assert kv_transfer_config is not None
+        extra_config = kv_transfer_config.kv_connector_extra_config
+        spec_name = extra_config.get("spec_name", "CPUOffloadingSpec")
+        if spec_name in cls._registry:
+            spec_cls = cls._registry[spec_name]()
+        else:
+            spec_module_path = extra_config.get("spec_module_path")
+            if spec_module_path is None:
+                raise ValueError(f"Unsupported spec type: {spec_name}")
+            spec_module = importlib.import_module(spec_module_path)
+            spec_cls = getattr(spec_module, spec_name)
+        assert issubclass(spec_cls, OffloadingSpec)
+        logger.info("Creating offloading spec with name: %s", spec_name)
+        return spec_cls(config)
+
+
+# Register various specs here.
+OffloadingSpecFactory.register_spec("CPUOffloadingSpec",
+                                    "vllm.v1.kv_offload.cpu",
+                                    "CPUOffloadingSpec")
--- a/vllm/v1/kv_offload/lru_manager.py
+++ b/vllm/v1/kv_offload/lru_manager.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import OrderedDict
+from collections.abc import Iterable
+from typing import Optional
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
+                                         OffloadingManager, PrepareStoreOutput)
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+
+
+class LRUOffloadingManager(OffloadingManager):
+    """
+    An OffloadingManager with a pluggable backend, which evicts blocks by LRU.
+    """
+
+    def __init__(self, backend: Backend, enable_events: bool = False):
+        self.backend: Backend = backend
+        # block_hash -> BlockStatus
+        self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
+        self.events: Optional[list[OffloadingEvent]] = \
+            [] if enable_events else None
+
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        hit_count = 0
+        for block_hash in block_hashes:
+            block = self.blocks.get(block_hash)
+            if block is None or not block.is_ready:
+                break
+            hit_count += 1
+        return hit_count
+
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        blocks = []
+        for block_hash in block_hashes:
+            block = self.blocks[block_hash]
+            assert block.is_ready
+            block.ref_cnt += 1
+            blocks.append(block)
+
+        return self.backend.get_load_store_spec(block_hashes, blocks)
+
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in reversed(list(block_hashes)):
+            if self.blocks.get(block_hash):
+                self.blocks.move_to_end(block_hash)
+
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in block_hashes:
+            block = self.blocks[block_hash]
+            assert block.ref_cnt > 0
+            block.ref_cnt -= 1
+
+    def prepare_store(
+            self,
+            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
+        # filter out blocks that are already stored
+        block_hashes_to_store = [
+            block_hash for block_hash in block_hashes
+            if block_hash not in self.blocks
+        ]
+
+        num_blocks_to_evict = (len(block_hashes_to_store) -
+                               self.backend.get_num_free_blocks())
+
+        # build list of blocks to evict
+        to_evict = []
+        if num_blocks_to_evict > 0:
+            for block_hash, block in self.blocks.items():
+                if block.ref_cnt == 0:
+                    to_evict.append(block_hash)
+                    num_blocks_to_evict -= 1
+                    if num_blocks_to_evict == 0:
+                        break
+            else:
+                # we could not evict enough blocks
+                return None
+
+        # evict blocks
+        for block_hash in to_evict:
+            self.backend.free(self.blocks.pop(block_hash))
+
+        if to_evict and self.events is not None:
+            self.events.append(
+                OffloadingEvent(block_hashes=to_evict,
+                                block_size=self.backend.block_size,
+                                medium=self.backend.medium,
+                                removed=True))
+
+        blocks = self.backend.allocate_blocks(block_hashes_to_store)
+        assert len(blocks) == len(block_hashes_to_store)
+
+        for block_hash, block in zip(block_hashes_to_store, blocks):
+            self.blocks[block_hash] = block
+
+        # build store specs for allocated blocks
+        store_spec = self.backend.get_load_store_spec(block_hashes_to_store,
+                                                      blocks)
+
+        return PrepareStoreOutput(block_hashes_to_store=block_hashes_to_store,
+                                  store_spec=store_spec,
+                                  block_hashes_evicted=to_evict)
+
+    def complete_store(self,
+                       block_hashes: Iterable[BlockHash],
+                       success: bool = True):
+        stored_block_hashes: list[BlockHash] = []
+        if success:
+            for block_hash in block_hashes:
+                block = self.blocks[block_hash]
+                if not block.is_ready:
+                    block.ref_cnt = 0
+                    stored_block_hashes.append(block_hash)
+        else:
+            for block_hash in block_hashes:
+                block = self.blocks[block_hash]
+                if not block.is_ready:
+                    self.backend.free(block)
+                    del self.blocks[block_hash]
+
+        if stored_block_hashes and self.events is not None:
+            self.events.append(
+                OffloadingEvent(block_hashes=stored_block_hashes,
+                                block_size=self.backend.block_size,
+                                medium=self.backend.medium,
+                                removed=False))
+
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        if self.events is not None:
+            yield from self.events
+            self.events.clear()
--- a/vllm/v1/kv_offload/mediums.py
+++ b/vllm/v1/kv_offload/mediums.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC
+
+import numpy as np
+
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+
+
+class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
+    """
+    Spec for loading/storing KV blocks from given block numbers.
+    """
+
+    def __init__(self, block_ids: list[int]):
+        self.block_ids = np.array(block_ids, dtype=np.int64)
+
+    def __repr__(self) -> str:
+        return repr(self.block_ids)
+
+
+class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to GPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "GPU"
+
+
+class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to CPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "CPU"
--- a/vllm/v1/kv_offload/spec.py
+++ b/vllm/v1/kv_offload/spec.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Iterator
+from typing import TYPE_CHECKING
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
+from vllm.v1.kv_offload.worker.worker import OffloadingHandler
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+logger = init_logger(__name__)
+
+
+class OffloadingSpec(ABC):
+    """Spec for an offloading connector"""
+
+    def __init__(self, vllm_config: "VllmConfig"):
+        logger.warning(
+            "Initializing OffloadingSpec. This API is experimental and "
+            "subject to change in the future as we iterate the design.")
+        self.vllm_config = vllm_config
+
+        kv_transfer_config = vllm_config.kv_transfer_config
+        assert kv_transfer_config is not None
+        self.extra_config = kv_transfer_config.kv_connector_extra_config
+
+        self.gpu_block_size = vllm_config.cache_config.block_size
+        self.offloaded_block_size = int(
+            self.extra_config.get("block_size", self.gpu_block_size))
+
+        assert self.offloaded_block_size % self.gpu_block_size == 0
+
+    @abstractmethod
+    def get_manager(self) -> OffloadingManager:
+        """
+        Get an OffloadingManager that will be used
+        by the scheduler-side offloading connector to track
+        offloaded blocks and manage evictions.
+        """
+        pass
+
+    @abstractmethod
+    def get_handlers(
+        self, kv_caches: dict[str, torch.Tensor]
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
+                        OffloadingHandler]]:
+        """
+        Get offloading handlers along with their respective src and dst types.
+
+        Args:
+            kv_caches: A dictionary of layer_name -> gpu_kv_cache tensor.
+
+        Yields:
+            Tuples of (src_type, dst_type, offloading_handler).
+        """
+        pass
--- a/vllm/v1/kv_offload/worker/init.py
+++ b/vllm/v1/kv_offload/worker/init.py
--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import numpy as np
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.attention import AttentionBackend
+from vllm.logger import init_logger
+from vllm.utils import is_pin_memory_available
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
+from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
+                                              TransferResult, TransferSpec)
+
+logger = init_logger(__name__)
+
+
+def expand_block_ids(block_ids: np.ndarray,
+                     block_size_factor: int,
+                     output: np.ndarray,
+                     skip_count: int = 0):
+    """
+    Convert a list of block IDs to a list of matching block ids,
+    assuming each block is composed of actual block_size_factor blocks.
+    Outputs to output tensor.
+    The first skip_count blocks will be skipped.
+    Note that skip_count must be less than block_size_factor.
+
+    For example, if block_ids = [0, 1, 3] and block_size_factor =  4,
+    then it yields [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15]
+    since 0 maps to [0, 1, 2, 3]
+    1 maps to [4, 5, 6, 7]
+    and 3 maps to [12, 13, 14, 15]
+    """
+    assert skip_count < block_size_factor
+
+    first_range = np.arange(skip_count, block_size_factor)
+    full_range = np.arange(0, block_size_factor)
+
+    output_idx = 0
+    for i, block_id in enumerate(block_ids):
+        base_block_id = block_id * block_size_factor
+        indices = first_range if i == 0 else full_range
+        output_end_idx = output_idx + len(indices)
+        output[output_idx:output_end_idx] = base_block_id + indices
+        output_idx = output_end_idx
+
+
+class CpuGpuOffloadingHandler(OffloadingHandler):
+
+    def __init__(self, gpu_block_size: int, cpu_block_size: int,
+                 num_cpu_blocks: int, gpu_caches: dict[str, torch.Tensor],
+                 attn_backends: dict[str, type[AttentionBackend]]):
+        assert cpu_block_size % gpu_block_size == 0
+        self.block_size_factor = cpu_block_size // gpu_block_size
+
+        # cuda streams for gpu->cpu and cpu->gpu
+        self.d2h_stream = torch.cuda.Stream()
+        self.h2d_stream = torch.cuda.Stream()
+
+        # job_id -> transfer cuda event
+        self.transfer_events: dict[int, torch.cuda.Event] = {}
+        # list of cuda events available for re-use
+        self.events_pool: list[torch.cuda.Event] = []
+
+        pin_memory = is_pin_memory_available()
+
+        # allocate cpu tensors
+        logger.info("Allocating %d CPU tensors...", len(gpu_caches))
+        self.gpu_tensors: list[torch.Tensor] = []
+        self.cpu_tensors: list[torch.Tensor] = []
+        self.kv_dim_before_num_blocks: list[bool] = []
+        for layer_name, gpu_tensor in gpu_caches.items():
+            self.gpu_tensors.append(gpu_tensor)
+
+            gpu_shape = gpu_tensor.shape
+            test_shape = attn_backends[layer_name].get_kv_cache_shape(
+                num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256)
+            if test_shape[0] == 1234:
+                # shape is (num_blocks, ...)
+                num_blocks_idx = 0
+                self.kv_dim_before_num_blocks.append(False)
+            else:
+                # shape should be (2, num_blocks, ...)
+                assert test_shape[0] == 2
+                assert test_shape[1] == 1234
+                assert gpu_shape[0] == 2
+
+                num_blocks_idx = 1
+                self.kv_dim_before_num_blocks.append(True)
+
+            cpu_shape = list(gpu_shape)
+            cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor
+
+            logger.debug("Allocating CPU tensor of shape %r", cpu_shape)
+            self.cpu_tensors.append(
+                torch.zeros(cpu_shape,
+                            dtype=gpu_tensor.dtype,
+                            device="cpu",
+                            pin_memory=pin_memory))
+
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        src_spec, dst_spec = spec
+        if isinstance(src_spec, CPULoadStoreSpec):
+            assert isinstance(dst_spec, GPULoadStoreSpec)
+            stream = self.h2d_stream
+            src_tensors = self.cpu_tensors
+            dst_tensors = self.gpu_tensors
+            src_block_size_factor = self.block_size_factor
+            dst_block_size_factor = 1
+        else:
+            assert isinstance(src_spec, GPULoadStoreSpec)
+            assert isinstance(dst_spec, CPULoadStoreSpec)
+            stream = self.d2h_stream
+            src_tensors = self.gpu_tensors
+            dst_tensors = self.cpu_tensors
+            src_block_size_factor = 1
+            dst_block_size_factor = self.block_size_factor
+
+        src_blocks = src_spec.block_ids
+        dst_blocks = dst_spec.block_ids
+        assert src_blocks.ndim == 1
+        assert dst_blocks.ndim == 1
+
+        dst_sub_blocks_to_skip = (-src_blocks.size % dst_block_size_factor)
+        src_sub_block_count = src_blocks.size * src_block_size_factor
+
+        assert (
+            src_sub_block_count == dst_blocks.size * dst_block_size_factor -
+            dst_sub_blocks_to_skip)
+
+        src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64)
+        expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0])
+        expand_block_ids(dst_blocks,
+                         dst_block_size_factor,
+                         src_to_dst[:, 1],
+                         skip_count=dst_sub_blocks_to_skip)
+        src_to_dst_tensor = torch.from_numpy(src_to_dst)
+
+        event = self.events_pool.pop() if self.events_pool \
+            else torch.cuda.Event()
+        with torch.cuda.stream(stream):
+            for src_tensor, dst_tensor, kv_dim in zip(
+                    src_tensors, dst_tensors, self.kv_dim_before_num_blocks):
+                if kv_dim:
+                    src_key_cache = src_tensor[0]
+                    dst_key_cache = dst_tensor[0]
+                    ops.swap_blocks(src_key_cache, dst_key_cache,
+                                    src_to_dst_tensor)
+                    src_value_cache = src_tensor[1]
+                    dst_value_cache = dst_tensor[1]
+                    ops.swap_blocks(src_value_cache, dst_value_cache,
+                                    src_to_dst_tensor)
+                else:
+                    ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor)
+            event.record(stream)
+
+        self.transfer_events[job_id] = event
+
+        # success
+        return True
+
+    def get_finished(self) -> list[TransferResult]:
+        results: list[TransferResult] = []
+        for job_id, event in self.transfer_events.items():
+            if event.query():
+                results.append((job_id, True))
+                self.events_pool.append(event)
+        for job_id, _ in results:
+            del self.transfer_events[job_id]
+        return results
--- a/vllm/v1/kv_offload/worker/worker.py
+++ b/vllm/v1/kv_offload/worker/worker.py
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+
+from vllm.logger import init_logger
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+
+# a single transfer spec (src_blocks_spec, dst_blocks_spec)
+TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec]
+# transfers are forwarded to workers by (src_medium, dst_medium)
+TransferType = tuple[str, str]
+# transfer result (job_id, success)
+TransferResult = tuple[int, bool]
+
+logger = init_logger(__name__)
+
+
+class OffloadingHandler(ABC):
+    """
+    OffloadingHandler class for managing asynchronous KV data transfers
+
+    This class runs in the worker.
+    It kicks off async KV data transfer requests, and allows
+    collecting back completion statuses.
+
+    The class provides the following primitives:
+        transfer_async() - kicks off a new transfer job
+        get_finished() - returns a list of newly finished job IDs.
+    """
+
+    @abstractmethod
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        """
+        Initiates an asynchronous transfer of KV data.
+
+        Args:
+            job_id: a unique ID that will be used when notifying back on
+                transfer completion.
+            spec: the (src, dst) spec of the KV data transfer.
+
+        Returns:
+            True if transfer was submitted successfully.
+        """
+        pass
+
+    @abstractmethod
+    def get_finished(self) -> list[TransferResult]:
+        """
+        Get transfers finished since last call.
+
+        Returns:
+            A list of (job_id, success) of transfers.
+        """
+        pass
+
+
+class OffloadingWorker:
+    """
+    OffloadingWorker class for managing asynchronous KV data transfers
+    using multiple OffloadingHandlers
+
+    This class runs in the worker.
+    It kicks off async KV data transfer requests, by delegating
+    to one of its registered OffloadingHandlers, based on the transfer type.
+
+    The class provides the following primitives:
+        register_handler() - registers a new handler to handle
+            a specific transfer type
+        transfer_async() - kicks off a new transfer job
+            using one of the registered handlers.
+        get_finished() - returns a list of newly finished job IDs
+            from all handlers.
+    """
+
+    def __init__(self):
+        self.handlers: set[OffloadingHandler] = set()
+        self.transfer_type_to_handler: dict[TransferType,
+                                            OffloadingHandler] = {}
+
+    def register_handler(self, src_cls: type[LoadStoreSpec],
+                         dst_cls: type[LoadStoreSpec],
+                         handler: OffloadingHandler) -> None:
+        """
+        Registers a new handler.
+
+        Args:
+            src_cls: the source type of transfers handled by this handler.
+            dst_cls: the destination type of transfers handled by this handler.
+            handler: the handler that will handle transfers.
+        """
+        transfer_type = (src_cls.medium(), dst_cls.medium())
+        assert transfer_type not in self.transfer_type_to_handler
+        self.handlers.add(handler)
+        self.transfer_type_to_handler[transfer_type] = handler
+
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        """
+        Initiates an asynchronous transfer of KV data.
+
+        Args:
+            job_id: a unique ID that will be used when notifying back on
+                transfer completion.
+            spec: the (src, dst) spec of the KV data transfer.
+
+        Returns:
+            True if transfer was submitted successfully.
+        """
+        src, dst = spec
+        transfer_type = (src.medium(), dst.medium())
+        handler = self.transfer_type_to_handler.get(transfer_type)
+        assert handler is not None
+
+        try:
+            success = handler.transfer_async(job_id, spec)
+        except Exception as e:
+            logger.warning("Exception in %r transfer %d: %r",
+                           transfer_type,
+                           job_id,
+                           e,
+                           exc_info=True)
+            return False
+
+        if not success:
+            logger.warning("Failed to submit %r transfer %d", transfer_type,
+                           job_id)
+        else:
+            logger.debug("Submitted %r transfer %d: %r", transfer_type, job_id,
+                         spec)
+
+        return success
+
+    def get_finished(self) -> list[TransferResult]:
+        """
+        Get transfers finished since last call.
+
+        Returns:
+            A list of (job_id, success) of transfers.
+        """
+        finished = []
+        for handler in self.handlers:
+            finished.extend(handler.get_finished())
+        return finished