[gpt-oss] Add gpt-oss bf16 support
This commit is contained in:
0
vllm/core/block/__init__.py
Normal file
0
vllm/core/block/__init__.py
Normal file
399
vllm/core/block/block_table.py
Normal file
399
vllm/core/block/block_table.py
Normal file
@@ -0,0 +1,399 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
from typing import List, Optional
|
||||
|
||||
from vllm.core.block.common import BlockList
|
||||
from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator
|
||||
from vllm.utils import Device, cdiv, chunk_list
|
||||
|
||||
|
||||
class BlockTable:
    """A class to manage blocks for a specific sequence.

    The BlockTable maps a sequence of tokens to a list of blocks, where each
    block represents a contiguous memory allocation for a portion of the
    sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is
    responsible for allocating and freeing memory for the blocks.

    Args:
        block_size (int): The maximum number of tokens that can be stored in a
            single block.
        block_allocator (DeviceAwareBlockAllocator): The block allocator used to
            manage memory for the blocks.
        _blocks (Optional[List[Block]], optional): An optional list of existing
            blocks to initialize the BlockTable with. If not provided, an empty
            BlockTable is created.
        max_block_sliding_window (Optional[int], optional): The number of
            blocks to keep around for each sequence. If None, all blocks
            are kept (eg., when sliding window is not used).
            It should at least fit the sliding window size of the model.

    Attributes:
        _block_size (int): The maximum number of tokens that can be stored in a
            single block.
        _allocator (DeviceAwareBlockAllocator): The block allocator used to
            manage memory for the blocks.
        _blocks (Optional[List[Block]]): The list of blocks managed by this
            BlockTable.
        _num_full_slots (int): The number of tokens currently stored in the
            blocks.
    """

    def __init__(
        self,
        block_size: int,
        block_allocator: DeviceAwareBlockAllocator,
        _blocks: Optional[List[Block]] = None,
        max_block_sliding_window: Optional[int] = None,
    ):
        self._block_size = block_size
        self._allocator = block_allocator
        if _blocks is None:
            _blocks = []
        self._blocks: BlockList = BlockList(_blocks)

        self._max_block_sliding_window = max_block_sliding_window
        # Non-zero when initialized from pre-existing blocks (e.g. a fork).
        self._num_full_slots = self._get_num_token_ids()

    @staticmethod
    def get_num_required_blocks(token_ids: List[int],
                                block_size: int,
                                num_lookahead_slots: int = 0) -> int:
        """Calculates the minimum number of blocks required to store a given
        sequence of token IDs along with any look-ahead slots that may be
        required (like in multi-step + chunked-prefill).

        This assumes worst-case scenario, where every block requires a new
        allocation (e.g. ignoring prefix caching).

        Args:
            token_ids (List[int]): The sequence of token IDs to be stored.
            block_size (int): The maximum number of tokens that can be stored in
                a single block.
            num_lookahead_slots (int): look-ahead slots that the sequence may
                require.

        Returns:
            int: The minimum number of blocks required to store the given
                sequence of token IDs along with any required look-ahead slots.
        """
        return cdiv(len(token_ids) + num_lookahead_slots, block_size)

    def allocate(self,
                 token_ids: List[int],
                 device: Device = Device.GPU,
                 extra_hash: Optional[int] = None) -> None:
        """Allocates memory blocks for storing the given sequence of token IDs.

        This method allocates the required number of blocks to store the given
        sequence of token IDs.

        Args:
            token_ids (List[int]): The sequence of token IDs to be stored.
            device (Device, optional): The device on which the blocks should be
                allocated. Defaults to Device.GPU.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefixcaching block.
        """
        # Allocation is one-shot: the table must currently be empty.
        assert not self._is_allocated
        assert token_ids
        blocks = self._allocate_blocks_for_token_ids(prev_block=None,
                                                     token_ids=token_ids,
                                                     device=device,
                                                     extra_hash=extra_hash)
        self.update(blocks)
        self._num_full_slots = len(token_ids)

    def update(self, blocks: List[Block]) -> None:
        """Resets the table to the newly provided blocks
        (with their corresponding block ids)
        """
        self._blocks.update(blocks)

    def append_token_ids(self,
                         token_ids: List[int],
                         num_lookahead_slots: int = 0,
                         num_computed_slots: Optional[int] = None,
                         extra_hash: Optional[int] = None) -> None:
        """Appends a sequence of token IDs to the existing blocks in the
        BlockTable.

        This method appends the given sequence of token IDs to the existing
        blocks in the BlockTable. If there is not enough space in the existing
        blocks, new blocks are allocated using the `ensure_num_empty_slots`
        method to accommodate the additional tokens.

        The token IDs are divided into chunks of size `block_size` (except for
        the first chunk, which may be smaller), and each chunk is appended to a
        separate block.

        Args:
            token_ids (List[int]): The sequence of token IDs to be appended.
            num_computed_slots (Optional[int]): The number of KV cache slots
                that are already filled (computed).
                When sliding window is enabled, this is used to compute how many
                blocks to drop at the front of the sequence.
                Without sliding window, None can be passed.
                Without chunked prefill, it should be the same as
                _num_full_slots.
            extra_hash (Optional[int]): The hash value of additional
                factors such as adapters that influence the block, apart
                from the token_ids.
        """
        assert self._is_allocated, "no blocks have been allocated"
        assert len(self._blocks) > 0

        # Drop blocks that are no longer needed due to sliding window
        if self._max_block_sliding_window is not None:
            null_block = self._allocator.allocate_or_get_null_block()
            assert num_computed_slots is not None
            # Blocks strictly before this index fall entirely outside the
            # sliding window and can be released.
            end_block_idx = (num_computed_slots //
                             self._block_size) - self._max_block_sliding_window
            for idx in range(0, end_block_idx):
                b = self._blocks[idx]
                # Freed slots are replaced with the shared null block so the
                # table keeps a fixed positional layout.
                if b is not null_block:
                    self._allocator.free(b)
                self._blocks[idx] = null_block

        # Ensure there are enough empty slots for the new tokens plus
        # lookahead slots
        self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                    num_lookahead_slots,
                                    extra_hash=extra_hash)

        # Update the blocks with the new tokens
        first_block_idx = self._num_full_slots // self._block_size
        token_blocks = self._chunk_token_blocks_for_append(token_ids)

        for i, token_block in enumerate(token_blocks):
            self._blocks.append_token_ids(first_block_idx + i, token_block)

        self._num_full_slots += len(token_ids)

    def ensure_num_empty_slots(self,
                               num_empty_slots: int,
                               extra_hash: Optional[int] = None) -> None:
        """Ensures that the BlockTable has at least the specified number of
        empty slots available.

        This method checks if the BlockTable has enough empty slots (i.e.,
        available space) to accommodate the requested number of tokens. If not,
        it allocates additional blocks on the GPU to ensure that the required
        number of empty slots is available.

        Args:
            num_empty_slots (int): The minimum number of empty slots required.
            extra_hash (Optional[int]): The hash value of additional
                factors such as adapters that influence the block, apart
                from the token_ids.
        """
        # Currently the block table only supports
        # appending tokens to GPU blocks.
        device = Device.GPU
        assert self._is_allocated

        if self._num_empty_slots >= num_empty_slots:
            return

        slots_to_allocate = num_empty_slots - self._num_empty_slots
        blocks_to_allocate = cdiv(slots_to_allocate, self._block_size)

        for _ in range(blocks_to_allocate):
            assert len(self._blocks) > 0
            # New blocks are chained to the current tail for prefix hashing.
            self._blocks.append(
                self._allocator.allocate_mutable_block(
                    prev_block=self._blocks[-1],
                    device=device,
                    extra_hash=extra_hash))

    def fork(self) -> "BlockTable":
        """Creates a new BlockTable instance with a copy of the blocks from the
        current instance.

        This method creates a new BlockTable instance with the same block size,
        block allocator, and a copy of the blocks from the current instance. The
        new BlockTable has its own independent set of blocks, but shares the
        same underlying memory allocation with the original BlockTable.

        Returns:
            BlockTable: A new BlockTable instance with a copy of the blocks from
                the current instance.
        """
        assert self._is_allocated
        assert len(self._blocks) > 0
        forked_blocks = self._allocator.fork(self._blocks[-1])
        return BlockTable(
            block_size=self._block_size,
            block_allocator=self._allocator,
            _blocks=forked_blocks,
            max_block_sliding_window=self._max_block_sliding_window,
        )

    def free(self) -> None:
        """Frees the memory occupied by the blocks in the BlockTable.

        This method iterates over all the blocks in the `_blocks` list and calls
        the `free` method of the `_allocator` object to release the memory
        occupied by each block. After freeing all the blocks, the `_blocks` list
        is set to `None`.
        """
        for block in self.blocks:
            self._allocator.free(block)
        self._blocks.reset()

    @property
    def physical_block_ids(self) -> List[int]:
        """Returns a list of physical block indices for the blocks in the
        BlockTable.

        This property returns a list of integers, where each integer represents
        the physical block index of a corresponding block in the `_blocks` list.
        The physical block index is a unique identifier for the memory location
        occupied by the block.

        Returns:
            List[int]: A list of physical block indices for the blocks in the
                BlockTable.
        """
        return self._blocks.ids()

    def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]:
        """Get the number of "unseen" tokens in the sequence.

        Unseen tokens are tokens in the sequence corresponding to this block
        table, but are not yet appended to this block table.

        Args:
            sequence_token_ids (List[int]): The list of token ids in the
                sequence.

        Returns:
            List[int]: The postfix of sequence_token_ids that has not yet been
                appended to the block table.
        """

        # Since the block table is append-only, the unseen token ids are the
        # ones after the appended ones.
        return sequence_token_ids[self.num_full_slots:]

    def _allocate_blocks_for_token_ids(
            self,
            prev_block: Optional[Block],
            token_ids: List[int],
            device: Device,
            extra_hash: Optional[int] = None) -> List[Block]:
        # Split the tokens into full-sized chunks (eligible for immutable,
        # prefix-cacheable blocks) and at most one partial tail chunk.
        blocks: List[Block] = []

        block_token_ids: List[List[int]] = []
        tail_token_ids: List[List[int]] = []
        for cur_token_ids in chunk_list(token_ids, self._block_size):
            if len(cur_token_ids) == self._block_size:
                block_token_ids.append(cur_token_ids)
            else:
                tail_token_ids.append(cur_token_ids)

        if block_token_ids:
            blocks.extend(
                self._allocator.allocate_immutable_blocks(
                    prev_block,
                    block_token_ids=block_token_ids,
                    device=device,
                    extra_hash=extra_hash))
            prev_block = blocks[-1]

        if tail_token_ids:
            # Only the final chunk may be partial, so the tail holds at most
            # one entry; it becomes a mutable block that can still grow.
            assert len(tail_token_ids) == 1
            cur_token_ids = tail_token_ids[0]

            block = self._allocator.allocate_mutable_block(
                prev_block=prev_block, device=device, extra_hash=extra_hash)
            block.append_token_ids(cur_token_ids)

            blocks.append(block)

        return blocks

    def _get_all_token_ids(self) -> List[int]:
        # NOTE: This function is O(seq_len); use sparingly.
        token_ids: List[int] = []

        if not self._is_allocated:
            return token_ids

        for block in self.blocks:
            token_ids.extend(block.token_ids)

        return token_ids

    def _get_num_token_ids(self) -> int:
        # Total token count across all blocks (O(num_blocks)).
        res = 0
        for block in self.blocks:
            res += len(block.token_ids)

        return res

    @property
    def _is_allocated(self) -> bool:
        return len(self._blocks) > 0

    @property
    def blocks(self) -> List[Block]:
        return self._blocks.list()

    @property
    def _num_empty_slots(self) -> int:
        assert self._is_allocated
        return len(self._blocks) * self._block_size - self._num_full_slots

    @property
    def num_full_slots(self) -> int:
        """Returns the total number of tokens currently stored in the
        BlockTable.

        Returns:
            int: The total number of tokens currently stored in the BlockTable.
        """
        return self._num_full_slots

    def get_num_blocks_touched_by_append_slots(
            self, token_ids: List[int], num_lookahead_slots: int) -> int:
        """Determine how many blocks will be "touched" by appending the token
        ids.

        This is required for the scheduler to determine whether a sequence can
        continue generation, or if it must be preempted.
        """
        # Math below is equivalent to:
        # all_token_ids = token_ids + [-1] * num_lookahead_slots
        # token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
        # return len(token_blocks)

        num_token_ids = len(token_ids) + num_lookahead_slots
        # Remaining capacity in the last (possibly partial) block.
        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        num_token_blocks = (1 + math.ceil(
            (num_token_ids - first_chunk_size) / self._block_size))
        return num_token_blocks

    def _chunk_token_blocks_for_append(
            self, token_ids: List[int]) -> List[List[int]]:
        """Split the token ids into block-sized chunks so they can be easily
        appended to blocks. The first such "token block" may have less token ids
        than the block size, since the last allocated block may be partially
        full.

        If no token ids are provided, then no chunks are returned.
        """

        if not token_ids:
            return []

        # The first chunk fills up the tail block's remaining capacity.
        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        token_blocks = [token_ids[:first_chunk_size]]
        token_blocks.extend(
            chunk_list(token_ids[first_chunk_size:], self._block_size))
        return token_blocks
|
||||
371
vllm/core/block/common.py
Normal file
371
vllm/core/block/common.py
Normal file
@@ -0,0 +1,371 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections import deque
|
||||
from dataclasses import dataclass
|
||||
from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
|
||||
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator
|
||||
|
||||
# Physical block index within an allocator's pool.
BlockId = int
# Number of outstanding references to a block.
RefCount = int
|
||||
|
||||
|
||||
class RefCounterProtocol(Protocol):
    """Structural interface for reference-count bookkeeping on block ids."""

    def incr(self, block_id: BlockId) -> RefCount:
        """Increment the refcount of ``block_id`` and return the new value."""
        raise NotImplementedError

    def decr(self, block_id: BlockId) -> RefCount:
        """Decrement the refcount of ``block_id`` and return the new value."""
        raise NotImplementedError

    def get(self, block_id: BlockId) -> RefCount:
        """Return the current refcount of ``block_id``."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class RefCounter(RefCounterProtocol):
    """Tracks how many logical blocks reference each physical block id.

    Maintains a mapping from block id to its current reference count and
    exposes increment, decrement, and read operations. Only ids supplied at
    construction time are valid arguments.

    Args:
        all_block_indices (Iterable[BlockId]): Block ids to track. Duplicates
            are collapsed; every id starts with a refcount of zero.
    """

    def __init__(self, all_block_indices: Iterable[BlockId]):
        self._refcounts: Dict[BlockId, RefCount] = {
            block_id: 0
            for block_id in set(all_block_indices)
        }

    def incr(self, block_id: BlockId) -> RefCount:
        """Increment and return the refcount of ``block_id``."""
        assert block_id in self._refcounts
        current = self._refcounts[block_id]
        assert current >= 0

        updated = current + 1
        self._refcounts[block_id] = updated
        return updated

    def decr(self, block_id: BlockId) -> RefCount:
        """Decrement and return the refcount of ``block_id``.

        The refcount must be strictly positive before the call.
        """
        assert block_id in self._refcounts
        current = self._refcounts[block_id]
        assert current > 0

        updated = current - 1
        self._refcounts[block_id] = updated
        return updated

    def get(self, block_id: BlockId) -> RefCount:
        """Return the current refcount of ``block_id``."""
        assert block_id in self._refcounts
        return self._refcounts[block_id]

    def as_readonly(self) -> "ReadOnlyRefCounter":
        """Return a read-only view over this counter."""
        return ReadOnlyRefCounter(self)
|
||||
|
||||
|
||||
class ReadOnlyRefCounter(RefCounterProtocol):
    """Immutable view over a :class:`RefCounter`.

    Delegates reads to the wrapped counter while rejecting every mutation
    attempt with a ValueError.

    Args:
        refcounter (RefCounter): The counter to expose read-only.
    """

    def __init__(self, refcounter: RefCounter):
        self._refcounter = refcounter

    def incr(self, block_id: BlockId) -> RefCount:
        """Always rejected: this view is read-only."""
        raise ValueError("Incr not allowed")

    def decr(self, block_id: BlockId) -> RefCount:
        """Always rejected: this view is read-only."""
        raise ValueError("Decr not allowed")

    def get(self, block_id: BlockId) -> RefCount:
        """Return the refcount from the underlying counter."""
        return self._refcounter.get(block_id)
|
||||
|
||||
|
||||
class CopyOnWriteTracker:
    """Records pending copy-on-write (CoW) operations for shared blocks.

    Keeps a list of (source, destination) block-id pairs and, with the help
    of the supplied refcounter, answers whether a block may be written in
    place or must first be duplicated.

    Args:
        refcounter (RefCounterProtocol): Used to look up block refcounts.
    """

    def __init__(self, refcounter: RefCounterProtocol):
        self._copy_on_writes: List[Tuple[BlockId, BlockId]] = []
        self._refcounter = refcounter

    def is_appendable(self, block: Block) -> bool:
        """Return True if ``block`` may be appended to in place.

        A block without a physical id, or whose refcount is at most one, is
        unshared and needs no copy-on-write.
        """
        block_id = block.block_id
        if block_id is None:
            return True
        return self._refcounter.get(block_id) <= 1

    def record_cow(self, src_block_id: Optional[BlockId],
                   trg_block_id: Optional[BlockId]) -> None:
        """Queue a copy from ``src_block_id`` to ``trg_block_id``.

        Args:
            src_block_id (BlockId): Resolved id of the block to copy from.
            trg_block_id (BlockId): Resolved id of the block to copy to.
        """
        assert src_block_id is not None
        assert trg_block_id is not None
        self._copy_on_writes.append((src_block_id, trg_block_id))

    def clear_cows(self) -> List[Tuple[BlockId, BlockId]]:
        """Return the queued (src, dst) pairs and reset the queue.

        Returns:
            List[Tuple[BlockId, BlockId]]: The copy-on-write operations
                recorded since the last call.
        """
        pending, self._copy_on_writes = self._copy_on_writes, []
        return pending
|
||||
|
||||
|
||||
class BlockPool:
    """Used to pre-allocate block objects, in order to avoid excessive python
    object allocations/deallocations.
    The pool starts from "pool_size" objects and will increase to more objects
    if necessary

    Note that multiple block objects may point to the same physical block id,
    which is why this pool is needed, so that it will be easier to support
    prefix caching and more complicated sharing of physical blocks.
    """

    def __init__(self, block_size: int, create_block: Block.Factory,
                 allocator: BlockAllocator, pool_size: int):
        self._block_size = block_size
        self._create_block = create_block
        self._allocator = allocator
        self._pool_size = pool_size
        assert self._pool_size >= 0

        # Indices of pool slots currently free for reuse.
        self._free_ids: Deque[int] = deque(range(self._pool_size))
        self._pool = []
        for i in range(self._pool_size):
            # Pre-create detached block objects (no physical block id); they
            # are re-initialized in init_block() before being handed out.
            self._pool.append(
                self._create_block(prev_block=None,
                                   token_ids=[],
                                   block_size=self._block_size,
                                   allocator=self._allocator,
                                   block_id=None,
                                   extra_hash=None))

    def increase_pool(self):
        """Doubles the internal pool size
        """
        cur_pool_size = self._pool_size
        new_pool_size = cur_pool_size * 2
        self._pool_size = new_pool_size

        # The new slots are all free initially.
        self._free_ids += deque(range(cur_pool_size, new_pool_size))

        for i in range(cur_pool_size, new_pool_size):
            self._pool.append(
                self._create_block(prev_block=None,
                                   token_ids=[],
                                   block_size=self._block_size,
                                   allocator=self._allocator,
                                   block_id=None,
                                   extra_hash=None))

    def init_block(self,
                   prev_block: Optional[Block],
                   token_ids: List[int],
                   block_size: int,
                   physical_block_id: Optional[int],
                   extra_hash: Optional[int] = None) -> Block:
        # Hand out a pooled block object, growing the pool if it is exhausted.
        if len(self._free_ids) == 0:
            self.increase_pool()
            assert len(self._free_ids) > 0

        pool_id = self._free_ids.popleft()

        block = self._pool[pool_id]
        # Reuse the pooled object by re-running __init__ in place; this avoids
        # allocating a fresh Block per request (the point of the pool).
        block.__init__(  # type: ignore[misc]
            prev_block=prev_block,
            token_ids=token_ids,
            block_size=block_size,
            allocator=block._allocator,  # type: ignore[attr-defined]
            block_id=physical_block_id,
            extra_hash=extra_hash)
        # Remember which slot the block came from so free_block() can return it.
        block.pool_id = pool_id  # type: ignore[attr-defined]
        return block

    def free_block(self, block: Block) -> None:
        # Return the block's slot to the free list; the object itself stays in
        # self._pool for reuse.
        self._free_ids.appendleft(block.pool_id)  # type: ignore[attr-defined]
|
||||
|
||||
|
||||
class BlockList:
    """Block container that caches physical block ids for fast retrieval.

    Mirrors its list of Block objects with a parallel list of their physical
    ids so the block manager does not have to rebuild the id list on every
    scheduling iteration.
    """

    def __init__(self, blocks: List[Block]):
        self._blocks: List[Block] = []
        self._block_ids: List[int] = []

        self.update(blocks)

    def _add_block_id(self, block_id: Optional[BlockId]) -> None:
        # Every tracked block must already be bound to a physical id.
        assert block_id is not None
        self._block_ids.append(block_id)

    def _update_block_id(self, block_index: int,
                         new_block_id: Optional[BlockId]) -> None:
        assert new_block_id is not None
        self._block_ids[block_index] = new_block_id

    def update(self, blocks: List[Block]):
        """Replace the tracked blocks and rebuild the cached id list."""
        self._blocks = blocks

        self._block_ids = []
        for tracked in self._blocks:
            self._add_block_id(tracked.block_id)

    def append_token_ids(self, block_index: int, token_ids: List[int]) -> None:
        """Append tokens to the block at ``block_index``.

        Appending may change the block's physical id (e.g. after a
        copy-on-write or promotion); the cached id is refreshed when that
        happens.
        """
        target = self._blocks[block_index]
        old_id = target.block_id

        target.append_token_ids(token_ids)

        if old_id != target.block_id:
            self._update_block_id(block_index, target.block_id)

    def append(self, new_block: Block):
        """Track one additional block and cache its id."""
        self._blocks.append(new_block)
        self._add_block_id(new_block.block_id)

    def __len__(self) -> int:
        return len(self._blocks)

    def __getitem__(self, block_index: int) -> Block:
        return self._blocks[block_index]

    def __setitem__(self, block_index: int, new_block: Block) -> None:
        self._blocks[block_index] = new_block
        self._update_block_id(block_index, new_block.block_id)

    def reset(self):
        """Drop all tracked blocks and their cached ids."""
        self._blocks = []
        self._block_ids = []

    def list(self) -> List[Block]:
        """Return the underlying block list (not a copy)."""
        return self._blocks

    def ids(self) -> List[int]:
        """Return the cached physical block ids (not a copy)."""
        return self._block_ids
|
||||
|
||||
|
||||
@dataclass
class CacheMetricData:
    """A utility dataclass to maintain cache metric.
    To avoid overflow, we maintain the hit rate in block granularity, so that
    we can maintain a single hit rate for n_completed_block x block_size,
    and calculate the real time hit rate by the following:
    BS = The number of queries per block.
    nB = The number of completed blocks.
    HR = hit rate of (nB x BS) queries.
    Q = current number of queries (< BS).
    H = current number of hits (< BS).
    hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS)
    """
    # Number of fully-accumulated blocks folded into the running average.
    num_completed_blocks: int = 0
    # Running average hit rate over the completed blocks.
    completed_block_cache_hit_rate: float = 0.0
    # Queries accumulated in the current (incomplete) block.
    num_incompleted_block_queries: int = 0
    # Hits accumulated in the current (incomplete) block.
    num_incompleted_block_hit: int = 0
    # Queries per block (the accumulation granularity).
    block_size: int = 1000

    def query(self, hit: bool):
        """Record one cache lookup and fold completed blocks into the average."""
        self.num_incompleted_block_queries += 1
        if hit:
            self.num_incompleted_block_hit += 1

        # When a block is completed, update the cache hit rate
        # and reset the incomplete numbers.
        if self.num_incompleted_block_queries != self.block_size:
            return

        block_hit_rate = (self.num_incompleted_block_hit /
                          self.num_incompleted_block_queries)
        self.completed_block_cache_hit_rate = (
            self.completed_block_cache_hit_rate * self.num_completed_blocks
            + block_hit_rate) / (self.num_completed_blocks + 1)
        self.num_incompleted_block_queries = 0
        self.num_incompleted_block_hit = 0
        self.num_completed_blocks += 1

    def get_hit_rate(self):
        """Return the current overall hit rate (0.0 when nothing was queried)."""
        # The incomplete block contributes with weight Q / BS.
        partial_weight = self.num_incompleted_block_queries / self.block_size
        total_weight = self.num_completed_blocks + partial_weight
        if total_weight == 0:
            return 0.0

        completed_contrib, partial_contrib = 0.0, 0.0
        if self.num_completed_blocks > 0:
            completed_contrib = (self.completed_block_cache_hit_rate *
                                 self.num_completed_blocks)
        if self.num_incompleted_block_queries > 0:
            partial_hit_rate = (self.num_incompleted_block_hit /
                                self.num_incompleted_block_queries)
            partial_contrib = (partial_hit_rate * partial_weight)
        return (completed_contrib + partial_contrib) / total_weight
|
||||
|
||||
|
||||
def get_all_blocks_recursively(last_block: Block) -> List[Block]:
    """Retrieves all the blocks in a sequence starting from the last block.

    Walks the ``prev_block`` links backwards from the given block and returns
    every block in the chain, ordered from the first (oldest) to the last.

    Args:
        last_block (Block): The last block in the sequence.

    Returns:
        List[Block]: A list of all the blocks in the sequence, in the order they
            appear.
    """
    chain: List[Block] = []
    node: Optional[Block] = last_block
    while node is not None:
        chain.append(node)
        node = node.prev_block
    chain.reverse()
    return chain
|
||||
441
vllm/core/block/cpu_gpu_block_allocator.py
Normal file
441
vllm/core/block/cpu_gpu_block_allocator.py
Normal file
@@ -0,0 +1,441 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Dict, FrozenSet, List, Optional, Tuple
|
||||
|
||||
from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
|
||||
DeviceAwareBlockAllocator)
|
||||
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
|
||||
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import Device
|
||||
|
||||
|
||||
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
|
||||
"""A block allocator that can allocate blocks on both CPU and GPU memory.
|
||||
|
||||
This class implements the `DeviceAwareBlockAllocator` interface and provides
|
||||
functionality for allocating and managing blocks of memory on both CPU and
|
||||
GPU devices.
|
||||
|
||||
The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU
|
||||
blocks, and allows for allocation, deallocation, forking, and swapping of
|
||||
blocks across these memory pools.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def create(
|
||||
allocator_type: str,
|
||||
num_gpu_blocks: int,
|
||||
num_cpu_blocks: int,
|
||||
block_size: int,
|
||||
) -> DeviceAwareBlockAllocator:
|
||||
"""Creates a CpuGpuBlockAllocator instance with the specified
|
||||
configuration.
|
||||
|
||||
This static method creates and returns a CpuGpuBlockAllocator instance
|
||||
based on the provided parameters. It initializes the CPU and GPU block
|
||||
allocators with the specified number of blocks, block size, and
|
||||
allocator type.
|
||||
|
||||
Args:
|
||||
allocator_type (str): The type of block allocator to use for CPU
|
||||
and GPU blocks. Currently supported values are "naive" and
|
||||
"prefix_caching".
|
||||
num_gpu_blocks (int): The number of blocks to allocate for GPU
|
||||
memory.
|
||||
num_cpu_blocks (int): The number of blocks to allocate for CPU
|
||||
memory.
|
||||
block_size (int): The size of each block in number of tokens.
|
||||
|
||||
Returns:
|
||||
DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the
|
||||
specified configuration.
|
||||
|
||||
Notes:
|
||||
- The block IDs are assigned contiguously, with GPU block IDs coming
|
||||
before CPU block IDs.
|
||||
"""
|
||||
# For HPU, block id 0 is used only for padding
|
||||
reserved_blocks = 1 if current_platform.is_hpu() else 0
|
||||
block_ids = list(
|
||||
range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
|
||||
num_gpu_blocks -= reserved_blocks
|
||||
gpu_block_ids = block_ids[:num_gpu_blocks]
|
||||
cpu_block_ids = block_ids[num_gpu_blocks:]
|
||||
|
||||
if allocator_type == "naive":
|
||||
gpu_allocator: BlockAllocator = NaiveBlockAllocator(
|
||||
create_block=NaiveBlock, # type: ignore
|
||||
num_blocks=num_gpu_blocks,
|
||||
block_size=block_size,
|
||||
block_ids=gpu_block_ids,
|
||||
)
|
||||
|
||||
cpu_allocator: BlockAllocator = NaiveBlockAllocator(
|
||||
create_block=NaiveBlock, # type: ignore
|
||||
num_blocks=num_cpu_blocks,
|
||||
block_size=block_size,
|
||||
block_ids=cpu_block_ids,
|
||||
)
|
||||
elif allocator_type == "prefix_caching":
|
||||
gpu_allocator = PrefixCachingBlockAllocator(
|
||||
num_blocks=num_gpu_blocks,
|
||||
block_size=block_size,
|
||||
block_ids=gpu_block_ids,
|
||||
)
|
||||
|
||||
cpu_allocator = PrefixCachingBlockAllocator(
|
||||
num_blocks=num_cpu_blocks,
|
||||
block_size=block_size,
|
||||
block_ids=cpu_block_ids,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown allocator type {allocator_type=}")
|
||||
|
||||
return CpuGpuBlockAllocator(
|
||||
cpu_block_allocator=cpu_allocator,
|
||||
gpu_block_allocator=gpu_allocator,
|
||||
)
|
||||
|
||||
def __init__(self, cpu_block_allocator: BlockAllocator,
             gpu_block_allocator: BlockAllocator):
    """Bind one allocator per device and build the id -> allocator index.

    Args:
        cpu_block_allocator (BlockAllocator): Allocator managing CPU blocks.
        gpu_block_allocator (BlockAllocator): Allocator managing GPU blocks.
    """
    overlap = (cpu_block_allocator.all_block_ids
               & gpu_block_allocator.all_block_ids)
    assert not overlap, \
        "cpu and gpu block allocators can't have intersection of block ids"

    self._allocators = {
        Device.CPU: cpu_block_allocator,
        Device.GPU: gpu_block_allocator,
    }

    # Swap mapping accumulated across scheduling moves; drained by
    # get_and_reset_swaps().
    self._swap_mapping: Dict[int, int] = {}
    # Shared placeholder block, created lazily on first request.
    self._null_block: Optional[Block] = None

    # Reverse index: block id -> the allocator that owns it.
    self._block_ids_to_allocator: Dict[int, BlockAllocator] = {}
    for allocator in self._allocators.values():
        for block_id in allocator.all_block_ids:
            self._block_ids_to_allocator[block_id] = allocator
|
||||
|
||||
def allocate_or_get_null_block(self) -> Block:
    """Return the shared null placeholder block, creating it on first use.

    Returns:
        Block: The singleton NullBlock for this allocator.
    """
    if self._null_block is None:
        # Back the placeholder with a real GPU block so it has a valid id.
        backing = self.allocate_mutable_block(None, Device.GPU)
        self._null_block = NullBlock(backing)
    return self._null_block
|
||||
|
||||
def allocate_mutable_block(self,
                           prev_block: Optional[Block],
                           device: Device,
                           extra_hash: Optional[int] = None) -> Block:
    """Allocate a fresh mutable block on the requested device.

    Args:
        prev_block (Optional[Block]): The preceding block in the sequence;
            used for prefix hashing.
        device (Device): Device on which the block is allocated.
        extra_hash (Optional[int]): Hash of additional factors (e.g.
            adapters) folded into the prefix-caching block hash.

    Returns:
        Block: The newly allocated mutable block.
    """
    allocator = self._allocators[device]
    return allocator.allocate_mutable_block(prev_block,
                                            extra_hash=extra_hash)
|
||||
|
||||
def allocate_immutable_blocks(
        self,
        prev_block: Optional[Block],
        block_token_ids: List[List[int]],
        device: Device,
        extra_hash: Optional[int] = None) -> List[Block]:
    """Allocate a chain of immutable blocks holding the given token ids.

    Args:
        prev_block (Optional[Block]): The preceding block in the sequence;
            used for prefix hashing.
        block_token_ids (List[List[int]]): Per-block token ids to store,
            one inner list per new block.
        device (Device): Device on which the blocks are allocated.
        extra_hash (Optional[int]): Hash of additional factors (e.g.
            adapters) folded into the prefix-caching block hash.

    Returns:
        List[Block]: The newly allocated immutable blocks, in order.
    """
    allocator = self._allocators[device]
    return allocator.allocate_immutable_blocks(prev_block,
                                               block_token_ids,
                                               extra_hash=extra_hash)
|
||||
|
||||
def allocate_immutable_block(self,
                             prev_block: Optional[Block],
                             token_ids: List[int],
                             device: Device,
                             extra_hash: Optional[int] = None) -> Block:
    """Allocate one immutable block holding the given token ids.

    Args:
        prev_block (Optional[Block]): The preceding block in the sequence;
            used for prefix hashing.
        token_ids (List[int]): Token ids to store in the new block.
        device (Device): Device on which the block is allocated.
        extra_hash (Optional[int]): Hash of additional factors (e.g.
            adapters) folded into the prefix-caching block hash.

    Returns:
        Block: The newly allocated immutable block.
    """
    allocator = self._allocators[device]
    return allocator.allocate_immutable_block(prev_block,
                                              token_ids,
                                              extra_hash=extra_hash)
|
||||
|
||||
def free(self, block: Block) -> None:
    """Return the given block's memory to its owning allocator.

    Args:
        block (Block): The block to be freed.
    """
    # The shared null placeholder must never actually be freed.
    if isinstance(block, NullBlock):
        return
    freed_id = block.block_id
    assert freed_id is not None
    owner = self._block_ids_to_allocator[freed_id]
    owner.free(block)
|
||||
|
||||
def fork(self, last_block: Block) -> List[Block]:
    """Clone the sequence ending at `last_block`, sharing its memory.

    Args:
        last_block (Block): The last block in the original sequence.

    Returns:
        List[Block]: A new block chain backed by the same physical memory
            as the original sequence.
    """
    # The null placeholder must never be forked.
    assert not isinstance(last_block, NullBlock)
    last_id = last_block.block_id
    assert last_id is not None
    owner = self._block_ids_to_allocator[last_id]
    return owner.fork(last_block)
|
||||
|
||||
def get_num_free_blocks(self, device: Device) -> int:
    """Count the free blocks on the given device.

    Args:
        device (Device): The device to query; must be a real device
            (an AssertionError is raised for None via the dict lookup).

    Returns:
        int: The number of free blocks currently available on `device`.
    """
    allocator = self._allocators[device]
    return allocator.get_num_free_blocks()
|
||||
|
||||
def get_num_total_blocks(self, device: Device) -> int:
    """Returns the total number of blocks managed on the given device."""
    return self._allocators[device].get_num_total_blocks()
|
||||
|
||||
def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
    """Translate an absolute block id to its zero-based id on `device`.

    Args:
        device (Device): The device whose local id space is wanted.
        absolute_id (int): The block's id in the whole-allocator id space.

    Returns:
        int: The zero-offset block id on the given device.
    """
    allocator = self._allocators[device]
    return allocator.get_physical_block_id(absolute_id)
|
||||
|
||||
def swap(self, blocks: List[Block], src_device: Device,
         dst_device: Device) -> Dict[int, int]:
    """Swap `blocks` from `src_device` to `dst_device`.

    The resulting src->dst id mapping is returned for this call and also
    accumulated into `self._swap_mapping` (drained later by
    get_and_reset_swaps) once per scheduling move.

    Args:
        blocks: List of blocks to be swapped.
        src_device (Device): Device to swap the 'blocks' from.
        dst_device (Device): Device to swap the 'blocks' to.

    Returns:
        Dict[int, int]: Mapping of source block ids to destination block
            ids for this swap.
    """
    # Capture ids before the move; swap_in reassigns block ids in place.
    ids_before = [blk.block_id for blk in blocks]
    self._allocators[src_device].swap_out(blocks)
    self._allocators[dst_device].swap_in(blocks)
    ids_after = [blk.block_id for blk in blocks]

    mapping: Dict[int, int] = {}
    for before, after in zip(ids_before, ids_after):
        if before is None or after is None:
            continue
        self._swap_mapping[before] = after
        mapping[before] = after
    return mapping
|
||||
|
||||
def get_num_full_blocks_touched(self, blocks: List[Block],
                                device: Device) -> int:
    """Count the full blocks a swap of `blocks` would touch on `device`.

    Args:
        blocks: List of blocks to be swapped.
        device (Device): Device to swap the 'blocks' on.

    Returns:
        int: Number of full blocks touched by swapping `blocks` in/out on
            `device`; partially-filled blocks are not counted.
    """
    allocator = self._allocators[device]
    return allocator.get_num_full_blocks_touched(blocks)
|
||||
|
||||
def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
    """Return and reset the pending copy-on-write (src, dst) id pairs.

    Returns:
        List[Tuple[int, int]]: Source-to-destination block id pairs for
            the CoW operations recorded since the last call.
    """
    # Copy-on-write is only tracked for the GPU allocator.
    gpu_allocator = self._allocators[Device.GPU]
    return gpu_allocator.clear_copy_on_writes()
|
||||
|
||||
def mark_blocks_as_accessed(self, block_ids: List[int],
                            now: float) -> None:
    """Mark blocks as accessed, only use for prefix caching."""
    # Prefix caching only supported on GPU.
    device = Device.GPU
    return self._allocators[device].mark_blocks_as_accessed(block_ids, now)
|
||||
|
||||
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
    """Mark blocks as computed, only use for prefix caching."""
    # Prefix caching only supported on GPU.
    device = Device.GPU
    return self._allocators[device].mark_blocks_as_computed(block_ids)
|
||||
|
||||
def get_common_computed_block_ids(
        self, computed_seq_block_ids: List[List[int]]) -> List[int]:
    """Return the block ids already computed for every given sequence;
    those blocks can be skipped during prefill."""
    # Prefix caching only supported on GPU.
    device = Device.GPU
    return self._allocators[device].get_common_computed_block_ids(
        computed_seq_block_ids)
|
||||
|
||||
@property
def all_block_ids(self) -> FrozenSet[int]:
    """All block ids managed across both devices."""
    # Iterating the reverse index yields every registered block id.
    return frozenset(self._block_ids_to_allocator)
|
||||
|
||||
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Prefix cache hit rate on `device`; -1 means unsupported/disabled."""
    assert device in self._allocators
    allocator = self._allocators[device]
    return allocator.get_prefix_cache_hit_rate()
|
||||
|
||||
def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
    """Reset the prefix cache for one device, or for all devices.

    Args:
        device (Optional[Device]): Device whose prefix cache to reset.
            If None, the caches of all devices are reset.

    Returns:
        bool: True only if every targeted allocator reset successfully.
    """
    # Compare explicitly against None: a falsy enum member (e.g. an
    # IntEnum with value 0) must still select the single-device path.
    if device is not None:
        return self._allocators[device].reset_prefix_cache()
    success = True
    for allocator in self._allocators.values():
        # Evaluate the reset first so a failure on one allocator does not
        # short-circuit and skip resetting the remaining allocators.
        success = allocator.reset_prefix_cache() and success
    return success
|
||||
|
||||
def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
    """Drain the accumulated source-to-destination swap mapping.

    Will be called after every swapping operation for now, and after
    every schedule when BlockManagerV2 becomes the default. Currently
    not useful.

    Returns:
        List[Tuple[int, int]]: A mapping of source to destination block
            IDs.
    """
    drained = list(self._swap_mapping.items())
    self._swap_mapping.clear()
    return drained
|
||||
|
||||
def find_cached_blocks_prefix(
    self,
    block_hashes: List[int],
    device: Device = Device.GPU,
) -> List[int]:
    """Return the longest already-cached prefix of `block_hashes` on
    `device` (GPU by default, the only device with prefix caching)."""
    allocator = self._allocators[device]
    return allocator.find_cached_blocks_prefix(block_hashes)
|
||||
|
||||
|
||||
class NullBlock(Block):
    """
    Null blocks are used as a placeholders for KV cache blocks that have
    been dropped due to sliding window.
    This implementation just wraps an ordinary block and prevents it from
    being modified. It also allows for testing if a block is NullBlock
    via isinstance().
    """

    def __init__(self, proxy: Block):
        """Wrap `proxy`, the real block backing this placeholder."""
        super().__init__()
        self._proxy = proxy

    def append_token_ids(self, token_ids: List[BlockId]):
        # Null blocks are read-only placeholders.
        raise ValueError("null block should not be modified")

    @property
    def block_id(self):
        # Delegates to the wrapped block's physical id.
        return self._proxy.block_id

    @block_id.setter
    def block_id(self, value: Optional[BlockId]):
        # Reassigning the id would mutate the placeholder; forbidden.
        raise ValueError("null block should not be modified")

    @property
    def token_ids(self) -> List[BlockId]:
        return self._proxy.token_ids

    @property
    def num_tokens_total(self) -> int:
        raise NotImplementedError(
            "num_tokens_total is not used for null block")

    @property
    def num_empty_slots(self) -> BlockId:
        return self._proxy.num_empty_slots

    @property
    def is_full(self):
        return self._proxy.is_full

    @property
    def prev_block(self):
        return self._proxy.prev_block

    @property
    def extra_hash(self):
        # Null blocks never participate in prefix caching.
        return None

    @property
    def computed(self):
        return self._proxy.computed

    @computed.setter
    def computed(self, value):
        # Computed status tracking is allowed; it does not change contents.
        self._proxy.computed = value

    @property
    def last_accessed(self) -> float:
        return self._proxy.last_accessed

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        # Access-time tracking is allowed; it does not change contents.
        self._proxy.last_accessed = last_accessed_ts

    @property
    def content_hash(self):
        return self._proxy.content_hash
||||
319
vllm/core/block/interfaces.py
Normal file
319
vllm/core/block/interfaces.py
Normal file
@@ -0,0 +1,319 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
|
||||
|
||||
from vllm.utils import Device
|
||||
|
||||
BlockId = int
|
||||
|
||||
|
||||
class Block(ABC):
    """Interface for one fixed-size chunk of KV-cache token storage.

    A block stores a bounded run of token ids and links back to the
    previous block of its sequence; concrete implementations add naive or
    prefix-caching behavior.
    """

    @abstractmethod
    def append_token_ids(self, token_ids: List[int]) -> None:
        pass

    @property
    @abstractmethod
    def block_id(self) -> Optional[int]:
        # Physical block id, or None when the block is not backed by
        # physical memory.
        pass

    @block_id.setter
    @abstractmethod
    def block_id(self, value: Optional[int]) -> None:
        """NOTE: Do not use this API outside Block."""
        self._block_id = value

    @property
    @abstractmethod
    def token_ids(self) -> List[int]:
        pass

    @property
    @abstractmethod
    def num_tokens_total(self) -> int:
        """The number of tokens till the current block (inclusive)
        """
        pass

    @property
    @abstractmethod
    def num_empty_slots(self) -> int:
        pass

    @property
    @abstractmethod
    def is_full(self) -> bool:
        pass

    @property
    @abstractmethod
    def prev_block(self) -> Optional["Block"]:
        pass

    @property
    @abstractmethod
    def extra_hash(self) -> Optional[int]:
        # Hash of extra factors (e.g. adapters) mixed into the block hash.
        return None

    @property
    @abstractmethod
    def computed(self) -> bool:
        raise NotImplementedError

    @computed.setter
    @abstractmethod
    def computed(self, value) -> bool:
        """Should be only used by PrefixCachingAllocator"""
        raise NotImplementedError

    @property
    @abstractmethod
    def last_accessed(self) -> float:
        raise NotImplementedError

    @last_accessed.setter
    @abstractmethod
    def last_accessed(self, last_accessed_ts: float):
        raise NotImplementedError

    # Nested structural type for the callables that construct blocks.
    class Factory(Protocol):

        @abstractmethod
        def __call__(
            self,
            prev_block: Optional["Block"],
            token_ids: List[int],
            block_size: int,
            allocator: "BlockAllocator",
            block_id: Optional[int] = None,
            computed: bool = False,
            extra_hash: Optional[int] = None,
        ) -> "Block":
            pass

    @property
    @abstractmethod
    def content_hash(self) -> Optional[int]:
        """Return the content-based hash of the current block, or None if it is
        not yet defined or not supported.

        For the content-based hash to be defined, the current block must be
        full.
        """
        return None
|
||||
|
||||
|
||||
class BlockAllocator(ABC):
    """Single-device allocator interface.

    Hands out, frees, forks and swaps Blocks, and exposes the hooks used
    by copy-on-write and prefix caching.
    """

    @abstractmethod
    def allocate_mutable_block(self, prev_block: Optional[Block],
                               extra_hash: Optional[int]) -> Block:
        pass

    @abstractmethod
    def allocate_immutable_block(self, prev_block: Optional[Block],
                                 token_ids: List[int],
                                 extra_hash: Optional[int]) -> Block:
        pass

    @abstractmethod
    def allocate_immutable_blocks(self, prev_block: Optional[Block],
                                  block_token_ids: List[List[int]],
                                  extra_hash: Optional[int]) -> List[Block]:
        pass

    @abstractmethod
    def free(self, block: Block) -> None:
        pass

    @abstractmethod
    def fork(self, last_block: Block) -> List[Block]:
        pass

    @abstractmethod
    def get_num_total_blocks(self) -> int:
        pass

    @abstractmethod
    def get_num_free_blocks(self) -> int:
        pass

    @abstractmethod
    def get_physical_block_id(self, absolute_id: int) -> int:
        pass

    @abstractmethod
    def swap_out(self, blocks: List[Block]) -> None:
        pass

    @abstractmethod
    def swap_in(self, blocks: List[Block]) -> None:
        pass

    @property
    @abstractmethod
    def all_block_ids(self) -> FrozenSet[int]:
        pass

    @abstractmethod
    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        pass

    @abstractmethod
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        pass

    @abstractmethod
    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
        """NOTE: This should not be used besides Block"""
        pass

    @abstractmethod
    def promote_to_immutable_block(self, block: Block) -> BlockId:
        """NOTE: This should not be used besides Block"""
        pass

    @abstractmethod
    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache."""
        pass

    # Raised when an allocation is requested but no free block remains.
    class NoFreeBlocksError(ValueError):
        pass

    @abstractmethod
    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
    ) -> List[int]:
        pass
|
||||
|
||||
|
||||
class DeviceAwareBlockAllocator(ABC):
    """Multi-device allocator interface.

    Mirrors BlockAllocator but routes each operation to a device chosen by
    the caller, and adds cross-device swapping plus the null-block hook.
    """

    @abstractmethod
    def allocate_mutable_block(self,
                               prev_block: Optional[Block],
                               device: Device,
                               extra_hash: Optional[int] = None) -> Block:
        pass

    @abstractmethod
    def allocate_immutable_block(self,
                                 prev_block: Optional[Block],
                                 token_ids: List[int],
                                 device: Device,
                                 extra_hash: Optional[int] = None) -> Block:
        pass

    @abstractmethod
    def allocate_immutable_blocks(
        self,
        prev_block: Optional[Block],
        block_token_ids: List[List[int]],
        device: Device,
        extra_hash: Optional[int] = None,
    ) -> List[Block]:
        pass

    @abstractmethod
    def get_num_free_blocks(self, device: Device) -> int:
        pass

    @abstractmethod
    def get_num_total_blocks(self, device: Device) -> int:
        pass

    @abstractmethod
    def free(self, block: Block) -> None:
        pass

    @abstractmethod
    def fork(self, last_block: Block) -> List[Block]:
        pass

    @property
    @abstractmethod
    def all_block_ids(self) -> FrozenSet[int]:
        pass

    @abstractmethod
    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        pass

    @abstractmethod
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        pass

    @abstractmethod
    def get_num_full_blocks_touched(self, blocks: List[Block],
                                    device: Device) -> int:
        pass

    @abstractmethod
    def swap(self, blocks: List[Block], src_device: Device,
             dst_device: Device) -> Dict[int, int]:
        pass

    @abstractmethod
    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
        pass

    @abstractmethod
    def allocate_or_get_null_block(self) -> Block:
        """
        Null blocks are used as a placeholders for KV cache blocks that have
        been dropped due to sliding window.
        There is at most one null block per allocator.
        """
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        """Reset prefix cache."""
        pass

    @abstractmethod
    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
        device: Device = Device.GPU,
    ) -> List[int]:
        pass
|
||||
466
vllm/core/block/naive_block.py
Normal file
466
vllm/core/block/naive_block.py
Normal file
@@ -0,0 +1,466 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections import deque
|
||||
from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
|
||||
get_all_blocks_recursively)
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
|
||||
|
||||
Refcount = int
|
||||
|
||||
|
||||
class NaiveBlockAllocator(BlockAllocator):
|
||||
"""A simple block allocator that manages blocks of memory without prefix
|
||||
caching.
|
||||
|
||||
Args:
|
||||
create_block (Block.Factory): A factory function for creating new
|
||||
blocks. This is used when a NaiveBlockAllocator is composed within
|
||||
a prefix caching allocator -- the naive block allocator must
|
||||
construct prefix caching blocks (but shouldn't know anything else
|
||||
about them).
|
||||
num_blocks (int): The total number of blocks to manage.
|
||||
block_size (int): The size of each block in tokens.
|
||||
block_ids (Optional[Iterable[int]], optional): An optional iterable of
|
||||
block IDs. If not provided, block IDs will be assigned sequentially
|
||||
from 0 to num_blocks - 1.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
create_block: Block.Factory,
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
block_ids: Optional[Iterable[int]] = None,
|
||||
block_pool: Optional[BlockPool] = None,
|
||||
):
|
||||
if block_ids is None:
|
||||
block_ids = range(num_blocks)
|
||||
|
||||
self._free_block_indices: Deque[BlockId] = deque(block_ids)
|
||||
self._all_block_indices = frozenset(block_ids)
|
||||
assert len(self._all_block_indices) == num_blocks
|
||||
|
||||
self._refcounter = RefCounter(
|
||||
all_block_indices=self._free_block_indices)
|
||||
self._block_size = block_size
|
||||
|
||||
self._cow_tracker = CopyOnWriteTracker(
|
||||
refcounter=self._refcounter.as_readonly())
|
||||
|
||||
if block_pool is None:
|
||||
extra_factor = 4
|
||||
# Pre-allocate "num_blocks * extra_factor" block objects.
|
||||
# The "* extra_factor" is a buffer to allow more block objects
|
||||
# than physical blocks
|
||||
self._block_pool = BlockPool(self._block_size, create_block, self,
|
||||
num_blocks * extra_factor)
|
||||
else:
|
||||
# In this case, the block pool is provided by the caller,
|
||||
# which means that there is most likely a need to share
|
||||
# a block pool between allocators
|
||||
self._block_pool = block_pool
|
||||
|
||||
def allocate_immutable_block(self,
|
||||
prev_block: Optional[Block],
|
||||
token_ids: List[int],
|
||||
extra_hash: Optional[int] = None,
|
||||
device: Optional[Device] = None) -> Block:
|
||||
"""Allocates a new immutable block with the given token IDs, linked to
|
||||
the previous block.
|
||||
|
||||
Args:
|
||||
prev_block (Optional[Block]): The previous block in the sequence. If
|
||||
None, then the block to be allocated is the first block in the
|
||||
sequence.
|
||||
token_ids (List[int]): The token IDs to be stored in the new block.
|
||||
|
||||
Returns:
|
||||
Block: The newly allocated immutable block.
|
||||
"""
|
||||
assert device is None
|
||||
block = self.allocate_mutable_block(prev_block=prev_block)
|
||||
block.append_token_ids(token_ids)
|
||||
return block
|
||||
|
||||
def allocate_immutable_blocks(
|
||||
self,
|
||||
prev_block: Optional[Block],
|
||||
block_token_ids: List[List[int]],
|
||||
extra_hash: Optional[int] = None,
|
||||
device: Optional[Device] = None) -> List[Block]:
|
||||
assert device is None
|
||||
num_blocks = len(block_token_ids)
|
||||
|
||||
block_ids = []
|
||||
for i in range(num_blocks):
|
||||
block_ids.append(self._allocate_block_id())
|
||||
|
||||
blocks = []
|
||||
for i in range(num_blocks):
|
||||
prev_block = self._block_pool.init_block(
|
||||
prev_block=prev_block,
|
||||
token_ids=block_token_ids[i],
|
||||
block_size=self._block_size,
|
||||
physical_block_id=block_ids[i])
|
||||
blocks.append(prev_block)
|
||||
|
||||
return blocks
|
||||
|
||||
def allocate_mutable_block(self,
|
||||
prev_block: Optional[Block],
|
||||
extra_hash: Optional[int] = None,
|
||||
device: Optional[Device] = None) -> Block:
|
||||
"""Allocates a new mutable block, linked to the previous block.
|
||||
|
||||
Args:
|
||||
prev_block (Optional[Block]): The previous block in the sequence. If
|
||||
None, then the block to be allocated is the first block in the
|
||||
sequence.
|
||||
|
||||
Returns:
|
||||
Block: The newly allocated mutable block.
|
||||
"""
|
||||
assert device is None
|
||||
block_id = self._allocate_block_id()
|
||||
block = self._block_pool.init_block(prev_block=prev_block,
|
||||
token_ids=[],
|
||||
block_size=self._block_size,
|
||||
physical_block_id=block_id)
|
||||
return block
|
||||
|
||||
def _allocate_block_id(self) -> BlockId:
|
||||
if not self._free_block_indices:
|
||||
raise BlockAllocator.NoFreeBlocksError()
|
||||
|
||||
block_id = self._free_block_indices.popleft()
|
||||
self._refcounter.incr(block_id)
|
||||
return block_id
|
||||
|
||||
def _free_block_id(self, block: Union[Block, BlockId]) -> None:
|
||||
if isinstance(block, Block):
|
||||
block_id = block.block_id
|
||||
block.block_id = None
|
||||
else:
|
||||
block_id = block
|
||||
assert block_id is not None
|
||||
|
||||
refcount = self._refcounter.decr(block_id)
|
||||
if refcount == 0:
|
||||
self._free_block_indices.appendleft(block_id)
|
||||
|
||||
def free(self, block: Block, keep_block_object: bool = False) -> None:
|
||||
# Release the physical block id
|
||||
self._free_block_id(block)
|
||||
|
||||
# Release the block object
|
||||
if not keep_block_object:
|
||||
self._block_pool.free_block(block)
|
||||
|
||||
def free_block_id(self, block_id: BlockId) -> None:
|
||||
self._free_block_id(block_id)
|
||||
|
||||
def fork(self, last_block: Block) -> List[Block]:
|
||||
"""Creates a new sequence of blocks that shares the same underlying
|
||||
memory as the original sequence.
|
||||
|
||||
Args:
|
||||
last_block (Block): The last block in the original sequence.
|
||||
|
||||
Returns:
|
||||
List[Block]: The new sequence of blocks that shares the same memory
|
||||
as the original sequence.
|
||||
"""
|
||||
source_blocks = get_all_blocks_recursively(last_block)
|
||||
|
||||
forked_blocks: List[Block] = []
|
||||
prev_block = None
|
||||
for block in source_blocks:
|
||||
|
||||
# Increment refcount for each block.
|
||||
assert block.block_id is not None
|
||||
refcount = self._refcounter.incr(block.block_id)
|
||||
assert refcount != 1, "can't fork free'd block"
|
||||
|
||||
forked_block = self._block_pool.init_block(
|
||||
prev_block=prev_block,
|
||||
token_ids=block.token_ids,
|
||||
block_size=self._block_size,
|
||||
physical_block_id=block.block_id)
|
||||
|
||||
forked_blocks.append(forked_block)
|
||||
prev_block = forked_blocks[-1]
|
||||
|
||||
return forked_blocks
|
||||
|
||||
def get_num_free_blocks(self) -> int:
|
||||
return len(self._free_block_indices)
|
||||
|
||||
def get_num_total_blocks(self) -> int:
|
||||
return len(self._all_block_indices)
|
||||
|
||||
def get_physical_block_id(self, absolute_id: int) -> int:
|
||||
"""Returns the zero-offset block id on certain block allocator
|
||||
given the absolute block id.
|
||||
|
||||
Args:
|
||||
absolute_id (int): The absolute block id for the block
|
||||
in whole allocator.
|
||||
|
||||
Returns:
|
||||
int: The zero-offset block id on certain device.
|
||||
"""
|
||||
return sorted(self._all_block_indices).index(absolute_id)
|
||||
|
||||
@property
|
||||
def refcounter(self):
|
||||
return self._refcounter
|
||||
|
||||
@property
|
||||
def all_block_ids(self) -> FrozenSet[int]:
|
||||
return self._all_block_indices
|
||||
|
||||
def cow_block_if_not_appendable(self, block: Block) -> BlockId:
|
||||
"""Performs a copy-on-write operation on the given block if it is not
|
||||
appendable.
|
||||
|
||||
Args:
|
||||
block (Block): The block to check for copy-on-write.
|
||||
|
||||
Returns:
|
||||
BlockId: The block index of the new block if a copy-on-write
|
||||
operation was performed, or the original block index if
|
||||
no copy-on-write was necessary.
|
||||
"""
|
||||
src_block_id = block.block_id
|
||||
assert src_block_id is not None
|
||||
|
||||
if self._cow_tracker.is_appendable(block):
|
||||
return src_block_id
|
||||
|
||||
self._free_block_id(block)
|
||||
trg_block_id = self._allocate_block_id()
|
||||
|
||||
self._cow_tracker.record_cow(src_block_id, trg_block_id)
|
||||
|
||||
return trg_block_id
|
||||
|
||||
def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
|
||||
"""Returns the copy-on-write source->destination mapping and clears it.
|
||||
|
||||
Returns:
|
||||
List[Tuple[BlockId, BlockId]]: A list mapping source
|
||||
block indices to destination block indices.
|
||||
"""
|
||||
return self._cow_tracker.clear_cows()
|
||||
|
||||
def mark_blocks_as_accessed(self, block_ids: List[int],
|
||||
now: float) -> None:
|
||||
"""Mark blocks as accessed, used in prefix caching.
|
||||
|
||||
Since the naive allocator does not implement prefix caching, we do
|
||||
nothing.
|
||||
"""
|
||||
pass
|
||||
|
||||
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
|
||||
"""Mark blocks as computed, used in prefix caching.
|
||||
|
||||
Since the naive allocator does not implement prefix caching, we do
|
||||
nothing.
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_common_computed_block_ids(
|
||||
self, computed_seq_block_ids: List[List[int]]) -> List[int]:
|
||||
"""Determine blocks that can be skipped in prefill.
|
||||
|
||||
Since the naive allocator does not support prefix caching, always return
|
||||
an empty list.
|
||||
"""
|
||||
return []
|
||||
|
||||
def promote_to_immutable_block(self, block: Block) -> BlockId:
|
||||
raise NotImplementedError("There is no promotion for naive blocks")
|
||||
|
||||
def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
|
||||
"""Returns the number of full blocks that will be touched by
|
||||
swapping in/out.
|
||||
|
||||
Args:
|
||||
blocks: List of blocks to be swapped.
|
||||
Returns:
|
||||
int: the number of full blocks that will be touched by
|
||||
swapping in/out the given blocks. Non full blocks are ignored
|
||||
when deciding the number of blocks to touch.
|
||||
"""
|
||||
# NOTE: for naive block, we use set to eliminate common blocks among
|
||||
# seqs, also we compare the empty slots in the mutable blocks with
|
||||
# lookahead slots to get the number of unique new block that are
|
||||
# needed.
|
||||
old_block_set = set()
|
||||
for block in blocks:
|
||||
if block.is_full:
|
||||
old_block_set.add(block)
|
||||
return len(old_block_set)
|
||||
|
||||
def swap_out(self, blocks: List[Block]) -> None:
    """Release the physical ids of ``blocks`` as part of swapping them
    out of this allocator's device.

    Args:
        blocks (List[Block]): Blocks whose ids should be freed.
    """
    for blk in blocks:
        self._free_block_id(blk)
def swap_in(self, blocks: List[Block]) -> None:
    """Swap the given blocks into this allocator's device.

    Each incoming block is re-backed by a freshly allocated physical
    block: we allocate a new (immutable or mutable) block, steal its
    block_id, recycle the temporary wrapper object, and assign the id
    to the existing ``block`` object so callers can keep using it.

    Args:
        blocks (List[Block]): Blocks to swap in; mutated in place by
            assigning each a new ``block_id``.
    """
    for src in blocks:
        if src.is_full:
            # Full blocks carry their final contents, so an immutable
            # allocation can take the token ids directly.
            fresh = self.allocate_immutable_block(
                prev_block=src.prev_block, token_ids=src.token_ids)
        else:
            fresh = self.allocate_mutable_block(
                prev_block=src.prev_block)
            fresh.append_token_ids(src.token_ids)

        # Detach the physical id from the temporary block and return
        # the wrapper to the pool so only ``src`` references the id.
        reclaimed_id = fresh.block_id
        fresh.block_id = None
        self._block_pool.free_block(fresh)

        src.block_id = reclaimed_id  # Assign block_id
def get_prefix_cache_hit_rate(self) -> float:
    """Return the prefix-cache hit rate.

    Returns:
        float: Always -1, the sentinel for "prefix caching disabled".
    """
    return -1
def reset_prefix_cache(self) -> bool:
    """No prefix cache for naive block allocator.

    Returns:
        bool: Always True — there is nothing to reset, so the reset
            trivially succeeds.
    """
    return True
def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
    """Look up cached blocks matching a prefix of ``block_hashes``.

    Not applicable for the naive block allocator (no prefix caching),
    so nothing is ever cached.

    Args:
        block_hashes (List[int]): Content hashes to look up; ignored.

    Returns:
        List[int]: Always an empty list.
    """
    return list()
class NaiveBlock(Block):
    """A fixed-size block of token IDs without prefix-caching support.

    Token IDs can be appended until the block is full. When an append
    lands on a block that is already backed by physical memory, the
    allocator is consulted so it can perform a copy-on-write on
    ``_cow_target`` if needed.

    Args:
        prev_block (Optional[Block]): The previous block in the
            sequence, or None for the first block.
        token_ids (List[int]): The initial token IDs stored in the
            block.
        block_size (int): The maximum number of token IDs the block can
            hold.
        allocator (BlockAllocator): The block allocator associated with
            this block.
        block_id (Optional[int], optional): The physical block index of
            this block. Defaults to None, meaning no allocation has
            been made yet.
        _cow_target (Optional[Block], optional): The copy-on-write
            target block. If not provided, it defaults to self.
        extra_hash (Optional[int], optional): Accepted for interface
            compatibility; naive blocks do not hash their contents, so
            it is ignored.
    """

    def __init__(self,
                 prev_block: Optional[Block],
                 token_ids: List[int],
                 block_size: int,
                 allocator: BlockAllocator,
                 block_id: Optional[int] = None,
                 _cow_target: Optional[Block] = None,
                 extra_hash: Optional[int] = None):
        self._block_size = block_size
        self._allocator = allocator
        self._prev_block = prev_block
        self._block_id = block_id
        self._cow_target = self if _cow_target is None else _cow_target
        self._token_ids: List[int] = []

        # Seed the block with the initial tokens; no CoW can be needed
        # during construction.
        self._append_token_ids_no_cow(token_ids)

    def append_token_ids(self, token_ids: List[int]) -> None:
        """Append the given token IDs, performing a copy-on-write if
        necessary.

        Args:
            token_ids (List[int]): The token IDs to be appended to the
                block.
        """
        self._append_token_ids_no_cow(token_ids)

        if self._block_id is None:
            return
        # The block is backed by physical memory: let the allocator
        # decide whether a copy-on-write is required and adopt whatever
        # block id it hands back.
        self._block_id = (self._allocator.cow_block_if_not_appendable(
            self._cow_target))

    def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
        """Append the given token IDs without any copy-on-write handling.

        Args:
            token_ids (List[int]): The token IDs to append; must fit in
                the remaining empty slots.
        """
        if not token_ids:
            return

        assert len(token_ids) <= self.num_empty_slots

        self._token_ids.extend(token_ids)

    @property
    def computed(self) -> bool:
        # Computed-state tracking only exists for prefix caching.
        raise NotImplementedError

    @computed.setter
    def computed(self, value) -> None:
        raise NotImplementedError

    @property
    def last_accessed(self) -> float:
        # Access-time tracking only exists for prefix caching.
        raise NotImplementedError

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        raise NotImplementedError

    @property
    def block_id(self) -> Optional[int]:
        # Physical block index, or None if unallocated.
        return self._block_id

    @block_id.setter
    def block_id(self, value: Optional[int]) -> None:
        self._block_id = value

    @property
    def is_full(self) -> bool:
        return len(self._token_ids) == self._block_size

    @property
    def num_empty_slots(self) -> int:
        return self._block_size - len(self._token_ids)

    @property
    def token_ids(self) -> List[int]:
        return self._token_ids

    @property
    def num_tokens_total(self) -> int:
        raise NotImplementedError(
            "num_tokens_total is not used for naive block")

    @property
    def block_size(self) -> int:
        return self._block_size

    @property
    def prev_block(self) -> Optional["Block"]:
        return self._prev_block

    @property
    def extra_hash(self):
        # Naive blocks carry no extra hash material.
        return None

    @property
    def content_hash(self) -> Optional[int]:
        # Content hashing is only meaningful with prefix caching.
        return None
1135
vllm/core/block/prefix_caching_block.py
Normal file
1135
vllm/core/block/prefix_caching_block.py
Normal file
File diff suppressed because it is too large
Load Diff
28
vllm/core/block/utils.py
Normal file
28
vllm/core/block/utils.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Block manager utils."""
|
||||
from vllm.sequence import SequenceGroup
|
||||
from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
|
||||
STR_NOT_IMPL_ENC_DEC_SWA)
|
||||
|
||||
|
||||
def check_no_caching_or_swa_for_blockmgr_encdec(
        block_mgr, seq_group: SequenceGroup) -> None:
    '''
    Enforce that prefix caching & sliding-window attention (SWA)
    are currently unsupported *specifically* for encoder/decoder models.

    Raises NotImplementedError if unsupported scenario is detected.

    Arguments:

    * block_mgr: BlockSpaceManager instance
    * seq_group: SequenceGroup passed to block_mgr
    '''

    if not seq_group.is_encoder_decoder():
        # Decoder-only models may use both features; nothing to check.
        return

    if block_mgr.max_block_sliding_window is not None:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA)

    if block_mgr.enable_caching:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE)
Reference in New Issue
Block a user