2026-04-02 04:53:13 +00:00
parent 80932c96e5
commit 24df76db9d
1987 changed files with 447445 additions and 0 deletions

View File

@@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import numpy as np
import torch

from vllm import _custom_ops as ops
from vllm.attention import AttentionBackend
from vllm.logger import init_logger
from vllm.utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
                                              TransferResult, TransferSpec)

logger = init_logger(__name__)


def expand_block_ids(block_ids: np.ndarray,
                     block_size_factor: int,
                     output: np.ndarray,
                     skip_count: int = 0):
    """
    Expand a list of block IDs into a list of sub-block IDs,
    assuming each block is composed of block_size_factor sub-blocks.
    The result is written into the output array.
    The first skip_count sub-blocks of the first block are skipped.
    Note that skip_count must be less than block_size_factor.

    For example, if block_ids = [0, 1, 3] and block_size_factor = 4,
    the output will be [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15],
    since 0 maps to [0, 1, 2, 3],
    1 maps to [4, 5, 6, 7]
    and 3 maps to [12, 13, 14, 15].
    """
    assert skip_count < block_size_factor

    first_range = np.arange(skip_count, block_size_factor)
    full_range = np.arange(0, block_size_factor)

    output_idx = 0
    for i, block_id in enumerate(block_ids):
        base_block_id = block_id * block_size_factor
        indices = first_range if i == 0 else full_range
        output_end_idx = output_idx + len(indices)
        output[output_idx:output_end_idx] = base_block_id + indices
        output_idx = output_end_idx
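

# A small worked example of the expansion above (illustrative only, with
# hypothetical inputs): expanding three blocks with block_size_factor=4 while
# skipping the first sub-block of the first block.
#
#   out = np.empty(3 * 4 - 1, dtype=np.int64)
#   expand_block_ids(np.array([0, 1, 3]), 4, out, skip_count=1)
#   # out is now [1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15]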


class CpuGpuOffloadingHandler(OffloadingHandler):

    def __init__(self, gpu_block_size: int, cpu_block_size: int,
                 num_cpu_blocks: int, gpu_caches: dict[str, torch.Tensor],
                 attn_backends: dict[str, type[AttentionBackend]]):
        assert cpu_block_size % gpu_block_size == 0
        self.block_size_factor = cpu_block_size // gpu_block_size

        # cuda streams for gpu->cpu and cpu->gpu
        self.d2h_stream = torch.cuda.Stream()
        self.h2d_stream = torch.cuda.Stream()

        # job_id -> transfer cuda event
        self.transfer_events: dict[int, torch.cuda.Event] = {}
        # list of cuda events available for re-use
        self.events_pool: list[torch.cuda.Event] = []

        pin_memory = is_pin_memory_available()

        # allocate cpu tensors
        logger.info("Allocating %d CPU tensors...", len(gpu_caches))
        self.gpu_tensors: list[torch.Tensor] = []
        self.cpu_tensors: list[torch.Tensor] = []
        self.kv_dim_before_num_blocks: list[bool] = []
        for layer_name, gpu_tensor in gpu_caches.items():
            self.gpu_tensors.append(gpu_tensor)
            gpu_shape = gpu_tensor.shape

            test_shape = attn_backends[layer_name].get_kv_cache_shape(
                num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256)
            if test_shape[0] == 1234:
                # shape is (num_blocks, ...)
                num_blocks_idx = 0
                self.kv_dim_before_num_blocks.append(False)
            else:
                # shape should be (2, num_blocks, ...)
                assert test_shape[0] == 2
                assert test_shape[1] == 1234
                assert gpu_shape[0] == 2
                num_blocks_idx = 1
                self.kv_dim_before_num_blocks.append(True)
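
            # For example (hypothetical layout): a backend whose KV cache
            # shape is (2, num_blocks, block_size, num_kv_heads, head_size)
            # returns (2, 1234, 16, 8, 256) for the probe above, so the
            # num_blocks dimension is index 1 and key/value share dim 0. The
            # CPU copy below is then sized to hold
            # num_cpu_blocks * block_size_factor GPU-sized blocks along that
            # dimension.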
            cpu_shape = list(gpu_shape)
            cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor

            logger.debug("Allocating CPU tensor of shape %r", cpu_shape)

            self.cpu_tensors.append(
                torch.zeros(cpu_shape,
                            dtype=gpu_tensor.dtype,
                            device="cpu",
                            pin_memory=pin_memory))

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        src_spec, dst_spec = spec
        if isinstance(src_spec, CPULoadStoreSpec):
            assert isinstance(dst_spec, GPULoadStoreSpec)
            stream = self.h2d_stream
            src_tensors = self.cpu_tensors
            dst_tensors = self.gpu_tensors
            src_block_size_factor = self.block_size_factor
            dst_block_size_factor = 1
        else:
            assert isinstance(src_spec, GPULoadStoreSpec)
            assert isinstance(dst_spec, CPULoadStoreSpec)
            stream = self.d2h_stream
            src_tensors = self.gpu_tensors
            dst_tensors = self.cpu_tensors
            src_block_size_factor = 1
            dst_block_size_factor = self.block_size_factor

        src_blocks = src_spec.block_ids
        dst_blocks = dst_spec.block_ids
        assert src_blocks.ndim == 1
        assert dst_blocks.ndim == 1

        dst_sub_blocks_to_skip = (-src_blocks.size % dst_block_size_factor)
        src_sub_block_count = src_blocks.size * src_block_size_factor
        assert (
            src_sub_block_count == dst_blocks.size * dst_block_size_factor -
            dst_sub_blocks_to_skip)
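        # For example (hypothetical sizes): offloading 5 GPU blocks to CPU
        # with block_size_factor=4 requires 2 CPU blocks; the first
        # (-5 % 4) = 3 CPU sub-blocks are skipped, so the copied data ends
        # exactly at the boundary of the last CPU block.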
        src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64)
        expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0])
        expand_block_ids(dst_blocks,
                         dst_block_size_factor,
                         src_to_dst[:, 1],
                         skip_count=dst_sub_blocks_to_skip)
        src_to_dst_tensor = torch.from_numpy(src_to_dst)

        event = self.events_pool.pop() if self.events_pool \
            else torch.cuda.Event()
        with torch.cuda.stream(stream):
            for src_tensor, dst_tensor, kv_dim in zip(
                    src_tensors, dst_tensors, self.kv_dim_before_num_blocks):
                if kv_dim:
                    src_key_cache = src_tensor[0]
                    dst_key_cache = dst_tensor[0]
                    ops.swap_blocks(src_key_cache, dst_key_cache,
                                    src_to_dst_tensor)
                    src_value_cache = src_tensor[1]
                    dst_value_cache = dst_tensor[1]
                    ops.swap_blocks(src_value_cache, dst_value_cache,
                                    src_to_dst_tensor)
                else:
                    ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor)
            event.record(stream)

        self.transfer_events[job_id] = event

        # success
        return True

    def get_finished(self) -> list[TransferResult]:
        results: list[TransferResult] = []
        for job_id, event in self.transfer_events.items():
            if event.query():
                results.append((job_id, True))
                self.events_pool.append(event)
        for job_id, _ in results:
            del self.transfer_events[job_id]
        return results
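

# A minimal usage sketch for the handler above (illustrative; the
# CPULoadStoreSpec/GPULoadStoreSpec constructor arguments, cache dicts and
# block IDs are assumptions, not taken from this diff):
#
#   handler = CpuGpuOffloadingHandler(gpu_block_size=16,
#                                     cpu_block_size=64,
#                                     num_cpu_blocks=1024,
#                                     gpu_caches=gpu_caches,
#                                     attn_backends=attn_backends)
#   # offload two GPU blocks into one CPU block (block_size_factor == 4)
#   spec = (GPULoadStoreSpec([3, 4]), CPULoadStoreSpec([7]))
#   assert handler.transfer_async(job_id=0, spec=spec)
#   while not handler.get_finished():
#       pass  # (0, True) is returned once the d2h copy has completed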

View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from abc import ABC, abstractmethod

from vllm.logger import init_logger
from vllm.v1.kv_offload.abstract import LoadStoreSpec

# a single transfer spec (src_blocks_spec, dst_blocks_spec)
TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec]
# transfers are forwarded to workers by (src_medium, dst_medium)
TransferType = tuple[str, str]
# transfer result (job_id, success)
TransferResult = tuple[int, bool]

logger = init_logger(__name__)


class OffloadingHandler(ABC):
    """
    OffloadingHandler class for managing asynchronous KV data transfers.

    This class runs in the worker.
    It kicks off async KV data transfer requests, and allows
    collecting back completion statuses.

    The class provides the following primitives:
        transfer_async() - kicks off a new transfer job
        get_finished() - returns newly finished jobs as
            (job_id, success) tuples.
    """

    @abstractmethod
    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        """
        Initiates an asynchronous transfer of KV data.

        Args:
            job_id: a unique ID that will be used when notifying back on
                transfer completion.
            spec: the (src, dst) spec of the KV data transfer.

        Returns:
            True if the transfer was submitted successfully.
        """
        pass

    @abstractmethod
    def get_finished(self) -> list[TransferResult]:
        """
        Get transfers finished since the last call.

        Returns:
            A list of (job_id, success) tuples of finished transfers.
        """
        pass
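

# A minimal concrete handler sketch (illustrative only; this class is an
# assumption for demonstration and not part of vLLM): it accepts every
# transfer and reports it as finished on the next poll.
class _SketchNoOpHandler(OffloadingHandler):

    def __init__(self):
        self._finished: list[TransferResult] = []

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        # nothing to copy; mark the job as successfully done right away
        self._finished.append((job_id, True))
        return True

    def get_finished(self) -> list[TransferResult]:
        finished, self._finished = self._finished, []
        return finished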


class OffloadingWorker:
    """
    OffloadingWorker class for managing asynchronous KV data transfers
    using multiple OffloadingHandlers.

    This class runs in the worker.
    It kicks off async KV data transfer requests, by delegating
    to one of its registered OffloadingHandlers, based on the transfer type.

    The class provides the following primitives:
        register_handler() - registers a new handler to handle
            a specific transfer type
        transfer_async() - kicks off a new transfer job
            using one of the registered handlers.
        get_finished() - returns newly finished jobs
            from all handlers.
    """

    def __init__(self):
        self.handlers: set[OffloadingHandler] = set()
        self.transfer_type_to_handler: dict[TransferType,
                                            OffloadingHandler] = {}

    def register_handler(self, src_cls: type[LoadStoreSpec],
                         dst_cls: type[LoadStoreSpec],
                         handler: OffloadingHandler) -> None:
        """
        Registers a new handler.

        Args:
            src_cls: the source type of transfers handled by this handler.
            dst_cls: the destination type of transfers handled by this handler.
            handler: the handler that will handle transfers.
        """
        transfer_type = (src_cls.medium(), dst_cls.medium())
        assert transfer_type not in self.transfer_type_to_handler
        self.handlers.add(handler)
        self.transfer_type_to_handler[transfer_type] = handler

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        """
        Initiates an asynchronous transfer of KV data.

        Args:
            job_id: a unique ID that will be used when notifying back on
                transfer completion.
            spec: the (src, dst) spec of the KV data transfer.

        Returns:
            True if the transfer was submitted successfully.
        """
        src, dst = spec
        transfer_type = (src.medium(), dst.medium())
        handler = self.transfer_type_to_handler.get(transfer_type)
        assert handler is not None

        try:
            success = handler.transfer_async(job_id, spec)
        except Exception as e:
            logger.warning("Exception in %r transfer %d: %r",
                           transfer_type,
                           job_id,
                           e,
                           exc_info=True)
            return False

        if not success:
            logger.warning("Failed to submit %r transfer %d", transfer_type,
                           job_id)
        else:
            logger.debug("Submitted %r transfer %d: %r", transfer_type, job_id,
                         spec)

        return success

    def get_finished(self) -> list[TransferResult]:
        """
        Get transfers finished since the last call.

        Returns:
            A list of (job_id, success) tuples of finished transfers.
        """
        finished = []
        for handler in self.handlers:
            finished.extend(handler.get_finished())
        return finished
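

# A wiring sketch (illustrative; the spec constructors, block IDs and handler
# configuration are assumptions): registering a CPU<->GPU handler for both
# transfer types and driving one GPU->CPU offload through the worker.
def _sketch_offloading_roundtrip(handler: OffloadingHandler) -> None:
    from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec

    # handlers are keyed by (src.medium(), dst.medium())
    worker = OffloadingWorker()
    worker.register_handler(GPULoadStoreSpec, CPULoadStoreSpec, handler)
    worker.register_handler(CPULoadStoreSpec, GPULoadStoreSpec, handler)

    # submit a GPU->CPU offload (block IDs assume cpu_block_size is
    # 4x gpu_block_size) and poll until the handler reports completion
    spec = (GPULoadStoreSpec([3, 4]), CPULoadStoreSpec([7]))
    assert worker.transfer_async(job_id=0, spec=spec)
    done: list[TransferResult] = []
    while not done:
        done = worker.get_finished()
    assert done == [(0, True)]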