Iluvatar-mrv100 SDK 4.3.0
This commit is contained in:
0
vllm/v1/core/__init__.py
Normal file
0
vllm/v1/core/__init__.py
Normal file
281
vllm/v1/core/block_pool.py
Normal file
281
vllm/v1/core/block_pool.py
Normal file
@@ -0,0 +1,281 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from collections import defaultdict
|
||||
from collections.abc import Iterable
|
||||
from typing import Callable, Optional
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
|
||||
KVCacheBlock,
|
||||
generate_block_hash_extra_keys,
|
||||
hash_block_tokens)
|
||||
from vllm.v1.request import Request
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class BlockPool:
|
||||
"""BlockPool that manages KVCacheBlocks.
|
||||
It provides methods to allocate, free and cache the kv cache blocks. The
|
||||
free_block_queue stores the free blocks in eviction order to enable
|
||||
allocation, free, and cache eviction. The cached_block_hash_to_block
|
||||
maps between block hash and cached block to support finding cached blocks
|
||||
by their block hash.
|
||||
|
||||
Args:
|
||||
num_gpu_blocks: The number of blocks in the pool.
|
||||
enable_caching: Whether to enable prefix caching.
|
||||
"""
|
||||
|
||||
def __init__(self, num_gpu_blocks: int, enable_caching: bool):
|
||||
assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
|
||||
self.num_gpu_blocks = num_gpu_blocks
|
||||
self.enable_caching = enable_caching
|
||||
# All kv-cache blocks.
|
||||
self.blocks: list[KVCacheBlock] = [
|
||||
KVCacheBlock(idx) for idx in range(num_gpu_blocks)
|
||||
]
|
||||
# Free block queue that constructs and manipulates a doubly linked
|
||||
# list of free blocks (including eviction candidates when caching is
|
||||
# enabled).
|
||||
self.free_block_queue = FreeKVCacheBlockQueue(self.blocks)
|
||||
|
||||
# {block_hash: {block ID: block}}. A cached block is
|
||||
# a full block with a block hash that can be used for prefix caching.
|
||||
# The cached block may be used by running requests or in the
|
||||
# free_block_queue that could potentially be evicted.
|
||||
# NOTE: We currently don't de-duplicate the blocks in the cache,
|
||||
# meaning that if a block becomes full and is cached, we don't check
|
||||
# if there is already an identical block in the cache. This is because
|
||||
# we want to make sure the allocated block IDs won't change so that
|
||||
# block tables are append-only.
|
||||
self.cached_block_hash_to_block: dict[BlockHashType, dict[
|
||||
int, KVCacheBlock]] = defaultdict(dict)
|
||||
|
||||
# To represent a placeholder block with block_id=0.
|
||||
# The ref_cnt of null_block is not maintained, needs special care to
|
||||
# avoid freeing it.
|
||||
self.null_block = self.free_block_queue.popleft()
|
||||
|
||||
def get_cached_block(self,
|
||||
block_hash: BlockHashType) -> Optional[KVCacheBlock]:
|
||||
"""Get a cached block by the block hash, or None if cache miss.
|
||||
If there are duplicated blocks, we return the first block in the cache.
|
||||
|
||||
Args:
|
||||
block_hash: The hash value of the block.
|
||||
|
||||
Returns:
|
||||
The cached block if it exists, or None.
|
||||
"""
|
||||
if block_hash in self.cached_block_hash_to_block:
|
||||
first_block_id = list(
|
||||
self.cached_block_hash_to_block[block_hash].keys())[0]
|
||||
return self.cached_block_hash_to_block[block_hash][first_block_id]
|
||||
return None
|
||||
|
||||
def cache_full_blocks(
|
||||
self,
|
||||
request: Request,
|
||||
blocks: list[KVCacheBlock],
|
||||
block_hashes: list[BlockHashType],
|
||||
num_cached_blocks: int,
|
||||
num_full_blocks: int,
|
||||
block_size: int,
|
||||
hash_fn: Callable,
|
||||
) -> None:
|
||||
"""Cache a list of full blocks for prefix caching.
|
||||
This function takes a list of blocks that will have their block hash
|
||||
metadata to be updated and cached. Given a request, it computes the
|
||||
block hashes for the blocks starting from `num_cached_blocks` to
|
||||
`num_full_blocks`, updating the metadata for each block
|
||||
and caching them in the `cached_block_hash_to_block`.
|
||||
|
||||
Args:
|
||||
request: The request to cache the blocks.
|
||||
blocks: All blocks in the request.
|
||||
block_hashes: Block hashes of the blocks in the request. Note that
|
||||
this list may be shorter than the blocks list. In this case the
|
||||
missed block hash will be computed in this function.
|
||||
num_cached_blocks: The number of blocks that are already cached.
|
||||
num_full_blocks: The number of blocks that are full and should
|
||||
be cached after this function.
|
||||
block_size: Number of tokens in each block.
|
||||
hash_fn: The hash function to use for block hashes.
|
||||
"""
|
||||
if num_cached_blocks == num_full_blocks:
|
||||
return
|
||||
new_full_blocks = blocks[num_cached_blocks:num_full_blocks]
|
||||
assert len(block_hashes) >= num_cached_blocks
|
||||
new_block_hashes = block_hashes[num_cached_blocks:]
|
||||
|
||||
# Update the new blocks with the block hashes through the chain.
|
||||
if num_cached_blocks == 0:
|
||||
prev_block_hash_value = None
|
||||
else:
|
||||
prev_block = blocks[num_cached_blocks - 1]
|
||||
assert prev_block.block_hash is not None
|
||||
prev_block_hash_value = prev_block.block_hash.hash_value
|
||||
|
||||
for i, blk in enumerate(new_full_blocks):
|
||||
assert blk.block_hash is None
|
||||
|
||||
if i < len(new_block_hashes):
|
||||
# The block hash may already be computed in
|
||||
# "get_computed_blocks" if the tokens are not generated by
|
||||
# this request (either the prompt tokens or the previously
|
||||
# generated tokens with preemption). In this case we simply
|
||||
# reuse the block hash.
|
||||
block_hash = new_block_hashes[i]
|
||||
else:
|
||||
# Otherwise compute the block hash and cache it in the request
|
||||
# in case it will be preempted in the future.
|
||||
blk_idx = num_cached_blocks + i
|
||||
start_token_idx = blk_idx * block_size
|
||||
end_token_idx = (blk_idx + 1) * block_size
|
||||
block_tokens = request.all_token_ids[
|
||||
start_token_idx:end_token_idx]
|
||||
assert len(block_tokens) == block_size, (
|
||||
f"Expected {block_size} tokens, got "
|
||||
f"{len(block_tokens)} at {blk_idx}th block for request "
|
||||
f"{request.request_id}({request})")
|
||||
|
||||
# Generate extra keys for multi-modal inputs. Note that since
|
||||
# we reach to this branch only when the block is completed with
|
||||
# generated tokens, we only need to consider the last mm input.
|
||||
extra_keys, _ = generate_block_hash_extra_keys(
|
||||
request, start_token_idx, end_token_idx, -1)
|
||||
|
||||
# Compute the hash of the current block.
|
||||
block_hash = hash_block_tokens(hash_fn, prev_block_hash_value,
|
||||
block_tokens, extra_keys)
|
||||
block_hashes.append(block_hash)
|
||||
|
||||
# Update and added the full block to the cache.
|
||||
blk.block_hash = block_hash
|
||||
self.cached_block_hash_to_block[block_hash][blk.block_id] = blk
|
||||
prev_block_hash_value = block_hash.hash_value
|
||||
|
||||
def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]:
|
||||
"""Get new blocks from the free block pool.
|
||||
|
||||
Note that we do not check block cache in this function.
|
||||
|
||||
Args:
|
||||
num_blocks: The number of blocks to allocate.
|
||||
|
||||
Returns:
|
||||
A list of new block.
|
||||
"""
|
||||
if num_blocks > self.get_num_free_blocks():
|
||||
raise ValueError(
|
||||
f"Cannot get {num_blocks} free blocks from the pool")
|
||||
|
||||
ret: list[KVCacheBlock] = []
|
||||
idx = 0
|
||||
while idx < num_blocks:
|
||||
# First allocate blocks.
|
||||
curr_block = self.free_block_queue.popleft()
|
||||
assert curr_block.ref_cnt == 0
|
||||
|
||||
# If the block is cached, evict it.
|
||||
if self.enable_caching:
|
||||
self._maybe_evict_cached_block(curr_block)
|
||||
|
||||
curr_block.incr_ref()
|
||||
ret.append(curr_block)
|
||||
idx += 1
|
||||
|
||||
return ret
|
||||
|
||||
def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
|
||||
"""
|
||||
If a block is cached in `cached_block_hash_to_block`, we reset its hash
|
||||
metadata and evict it from the cache.
|
||||
|
||||
Args:
|
||||
block: The block to evict.
|
||||
|
||||
Returns:
|
||||
True if the block is evicted, False otherwise.
|
||||
"""
|
||||
block_hash = block.block_hash
|
||||
if block_hash and block_hash in self.cached_block_hash_to_block:
|
||||
block.reset_hash()
|
||||
del self.cached_block_hash_to_block[block_hash][block.block_id]
|
||||
|
||||
if len(self.cached_block_hash_to_block[block_hash]) == 0:
|
||||
del self.cached_block_hash_to_block[block_hash]
|
||||
|
||||
return True
|
||||
return False
|
||||
|
||||
def touch(self, blocks: list[KVCacheBlock]) -> None:
|
||||
"""Touch a block increases its reference count by 1, and may remove
|
||||
the block from the free queue. This is used when a block is hit by
|
||||
another request with the same prefix.
|
||||
|
||||
Args:
|
||||
blocks: A list of blocks to touch.
|
||||
"""
|
||||
for block in blocks:
|
||||
# ref_cnt=0 means this block is in the free list (i.e. eviction
|
||||
# candidate), so remove it.
|
||||
if block.ref_cnt == 0 and block != self.null_block:
|
||||
self.free_block_queue.remove(block)
|
||||
block.incr_ref()
|
||||
|
||||
def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
|
||||
"""Free a list of blocks. The blocks should be ordered by their
|
||||
eviction priority, where the first block will be evicted first.
|
||||
|
||||
Args:
|
||||
ordered_blocks: A list of blocks to free ordered by their eviction
|
||||
priority.
|
||||
"""
|
||||
for block in ordered_blocks:
|
||||
block.decr_ref()
|
||||
# null_block should not be added to the free list.
|
||||
if block.ref_cnt == 0 and block != self.null_block:
|
||||
self.free_block_queue.append(block)
|
||||
|
||||
def reset_prefix_cache(self) -> bool:
|
||||
"""Reset prefix cache. This function may be used in RLHF
|
||||
flows to invalid prefix caching after the weights are updated,
|
||||
or used for resetting prefix caching status for benchmarking.
|
||||
|
||||
Returns:
|
||||
bool: True if the prefix cache is successfully reset,
|
||||
False otherwise.
|
||||
"""
|
||||
num_used_blocks = (self.num_gpu_blocks - self.get_num_free_blocks())
|
||||
if num_used_blocks != 1: # The null block is always marked as used
|
||||
logger.warning(
|
||||
"Failed to reset prefix cache because some "
|
||||
"blocks (%d) are not freed yet", num_used_blocks - 1)
|
||||
return False
|
||||
|
||||
# Remove all hashes so that no new blocks will hit.
|
||||
self.cached_block_hash_to_block = defaultdict(dict)
|
||||
|
||||
# Remove all hashes from all blocks.
|
||||
for block in self.blocks:
|
||||
block.reset_hash()
|
||||
|
||||
logger.info("Successfully reset prefix cache")
|
||||
return True
|
||||
|
||||
def get_num_free_blocks(self) -> int:
|
||||
"""Get the number of free blocks in the pool.
|
||||
|
||||
Returns:
|
||||
The number of free blocks.
|
||||
"""
|
||||
return self.free_block_queue.num_free_blocks
|
||||
|
||||
def get_usage(self) -> float:
|
||||
"""Get the KV cache usage.
|
||||
|
||||
Returns:
|
||||
The KV cache usage (between 0.0 and 1.0).
|
||||
"""
|
||||
return 1.0 - (self.get_num_free_blocks() / self.num_gpu_blocks)
|
||||
141
vllm/v1/core/encoder_cache_manager.py
Normal file
141
vllm/v1/core/encoder_cache_manager.py
Normal file
@@ -0,0 +1,141 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal import MultiModalRegistry
|
||||
from vllm.v1.request import Request
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig, SchedulerConfig
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class EncoderCacheManager:
|
||||
|
||||
def __init__(self, cache_size: int):
|
||||
self.cache_size = cache_size
|
||||
self.num_free_slots = cache_size
|
||||
# req_id -> cached input ids
|
||||
self.cached: dict[str, set[int]] = {}
|
||||
# list of [req_id, input_id]
|
||||
self.freed: list[tuple[str, int]] = []
|
||||
|
||||
def has_cache(self, request: Request, input_id: int) -> bool:
|
||||
req_id = request.request_id
|
||||
return req_id in self.cached and input_id in self.cached[req_id]
|
||||
|
||||
def can_allocate(self, request: Request, input_id: int) -> bool:
|
||||
num_tokens = request.get_num_encoder_tokens(input_id)
|
||||
return num_tokens <= self.num_free_slots
|
||||
|
||||
def allocate(self, request: Request, input_id: int) -> None:
|
||||
req_id = request.request_id
|
||||
if req_id not in self.cached:
|
||||
self.cached[req_id] = set()
|
||||
self.cached[req_id].add(input_id)
|
||||
self.num_free_slots -= request.get_num_encoder_tokens(input_id)
|
||||
|
||||
def get_cached_input_ids(self, request: Request) -> set[int]:
|
||||
return self.cached.get(request.request_id, set())
|
||||
|
||||
def free_encoder_input(self, request: Request, input_id: int) -> None:
|
||||
"""Free a single encoder input id for the request."""
|
||||
req_id = request.request_id
|
||||
if req_id not in self.cached:
|
||||
return
|
||||
|
||||
self.cached[req_id].discard(input_id)
|
||||
if len(self.cached[req_id]) == 0:
|
||||
del self.cached[req_id]
|
||||
self.num_free_slots += request.get_num_encoder_tokens(input_id)
|
||||
self.freed.append((req_id, input_id))
|
||||
|
||||
def free(self, request: Request) -> None:
|
||||
"""Free all cached input ids for the request."""
|
||||
input_ids = self.get_cached_input_ids(request).copy()
|
||||
for input_id in input_ids:
|
||||
self.free_encoder_input(request, input_id)
|
||||
|
||||
def get_freed_ids(self) -> list[tuple[str, int]]:
|
||||
freed = self.freed
|
||||
self.freed = []
|
||||
return freed
|
||||
|
||||
|
||||
def compute_encoder_budget(
|
||||
model_config: "ModelConfig",
|
||||
scheduler_config: "SchedulerConfig",
|
||||
mm_registry: MultiModalRegistry,
|
||||
) -> tuple[int, int]:
|
||||
"""Compute the encoder cache budget based on the model and scheduler
|
||||
configurations.
|
||||
|
||||
Args:
|
||||
model_config: Model configuration.
|
||||
scheduler_config: Scheduler configuration.
|
||||
mm_registry: Provides information about the token cost.
|
||||
|
||||
Returns:
|
||||
- Compute budget for encoder execution, in unit of number of tokens
|
||||
in the input sequence.
|
||||
- Space budget for encoder cache size, in unit of number of tokens
|
||||
in the input sequence.
|
||||
"""
|
||||
|
||||
if not model_config.is_multimodal_model:
|
||||
return 0, 0
|
||||
|
||||
# TODO: handle encoder-decoder models once we support them.
|
||||
(
|
||||
encoder_compute_budget,
|
||||
encoder_cache_size,
|
||||
) = _compute_encoder_budget_multimodal(
|
||||
model_config,
|
||||
scheduler_config,
|
||||
mm_registry,
|
||||
)
|
||||
|
||||
return encoder_compute_budget, encoder_cache_size
|
||||
|
||||
|
||||
def _compute_encoder_budget_multimodal(
|
||||
model_config: "ModelConfig",
|
||||
scheduler_config: "SchedulerConfig",
|
||||
mm_registry: MultiModalRegistry,
|
||||
) -> tuple[int, int]:
|
||||
"""Compute the encoder cache budget based on the model and scheduler
|
||||
configurations for a multimodal model.
|
||||
|
||||
Args:
|
||||
model_config: Model configuration.
|
||||
scheduler_config: Scheduler configuration.
|
||||
mm_registry: Provides information about the token cost.
|
||||
|
||||
Returns:
|
||||
- Compute budget for encoder execution, in unit of number of tokens
|
||||
in the input sequence.
|
||||
- Space budget for encoder cache size, in unit of number of tokens
|
||||
in the input sequence.
|
||||
"""
|
||||
|
||||
max_tokens_by_modality_dict = mm_registry \
|
||||
.get_max_tokens_per_item_by_nonzero_modality(model_config)
|
||||
|
||||
if not max_tokens_by_modality_dict:
|
||||
logger.warning(
|
||||
"All non-text modalities supported by the model have been "
|
||||
"explicitly disabled via limit_mm_per_prompt. Encoder cache will "
|
||||
"not be initialized.")
|
||||
return 0, 0
|
||||
|
||||
_, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
|
||||
key=lambda item: item[1])
|
||||
|
||||
encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens,
|
||||
max_tokens_per_mm_item)
|
||||
encoder_cache_size = max(scheduler_config.encoder_cache_size,
|
||||
max_tokens_per_mm_item)
|
||||
|
||||
return encoder_compute_budget, encoder_cache_size
|
||||
376
vllm/v1/core/kv_cache_manager.py
Normal file
376
vllm/v1/core/kv_cache_manager.py
Normal file
@@ -0,0 +1,376 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from collections import defaultdict
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import cdiv, sha256
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
|
||||
hash_request_tokens)
|
||||
from vllm.v1.core.specialized_manager import get_specialized_manager
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class KVCacheManager:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
kv_cache_config: KVCacheConfig,
|
||||
max_model_len: int,
|
||||
enable_caching: bool = True,
|
||||
caching_hash_algo: str = "builtin",
|
||||
num_preallocate_tokens: int = 64,
|
||||
log_stats: bool = False,
|
||||
) -> None:
|
||||
assert len(kv_cache_config.kv_cache_groups) == 1, (
|
||||
"KVCacheManager does not support hybrid models with more than 1 "
|
||||
"kv cache group")
|
||||
kv_cache_spec = kv_cache_config.kv_cache_groups[0].kv_cache_spec
|
||||
self.block_size = kv_cache_spec.block_size
|
||||
self.num_gpu_blocks = kv_cache_config.num_blocks
|
||||
self.max_model_len = max_model_len
|
||||
self.max_num_blocks_per_req = cdiv(max_model_len, self.block_size)
|
||||
|
||||
self.enable_caching = enable_caching
|
||||
self.caching_hash_fn = sha256 if caching_hash_algo == "sha256" else hash
|
||||
# FIXME: make prefix cache stats conditional on log_stats
|
||||
self.log_stats = log_stats
|
||||
# NOTE(woosuk): To avoid frequent block allocation, we preallocate some
|
||||
# blocks for each request. For example, when a request reaches the end
|
||||
# of its block table, we preallocate N blocks in advance. This way, we
|
||||
# reduce the overhead of updating free_block_ids and ref_cnts for each
|
||||
# request every step (at the cost of some memory waste).
|
||||
# NOTE(woosuk): This is different from the "lookahead" slots since this
|
||||
# does not guarantee that the request always has N empty blocks. After
|
||||
# the request gets N empty blocks, it starts to use the blocks without
|
||||
# further allocation. When it uses up all the N empty blocks, it gets
|
||||
# N new empty blocks.
|
||||
self.num_preallocate_tokens = num_preallocate_tokens
|
||||
self.num_preallocate_blocks = cdiv(num_preallocate_tokens,
|
||||
self.block_size)
|
||||
|
||||
self.block_pool = BlockPool(self.num_gpu_blocks, enable_caching)
|
||||
|
||||
self.specialized_manager = get_specialized_manager(
|
||||
kv_cache_spec=kv_cache_spec,
|
||||
block_pool=self.block_pool,
|
||||
)
|
||||
|
||||
# Mapping from request ID to blocks to track the blocks allocated
|
||||
# for each request, so that we can free the blocks when the request
|
||||
# is finished.
|
||||
self.req_to_blocks: defaultdict[str,
|
||||
list[KVCacheBlock]] = defaultdict(list)
|
||||
|
||||
# Mapping from request ID to kv block hashes.
|
||||
# This is to avoid recomputing the block hashes for each call of
|
||||
# `get_computed_blocks` or `allocate_slots`.
|
||||
self.req_to_block_hashes: defaultdict[
|
||||
str, list[BlockHashType]] = defaultdict(list)
|
||||
|
||||
# {req_id: The number of cached blocks for this given request}
|
||||
# This is used to track the number of cached blocks for each request.
|
||||
# This is only used to track the RUNNING requests, we do not track the
|
||||
# data for reempted ones.
|
||||
self.num_cached_block: dict[str, int] = {}
|
||||
self.prefix_cache_stats = PrefixCacheStats()
|
||||
|
||||
@property
|
||||
def usage(self) -> float:
|
||||
"""Get the KV cache usage.
|
||||
|
||||
Returns:
|
||||
The KV cache usage (between 0.0 and 1.0).
|
||||
"""
|
||||
return self.block_pool.get_usage()
|
||||
|
||||
def make_prefix_cache_stats(self) -> PrefixCacheStats:
|
||||
"""Get (and reset) the prefix cache stats.
|
||||
|
||||
Returns:
|
||||
The current prefix caching stats.
|
||||
"""
|
||||
stats = self.prefix_cache_stats
|
||||
self.prefix_cache_stats = PrefixCacheStats()
|
||||
return stats
|
||||
|
||||
def get_computed_blocks(
|
||||
self, request: Request) -> tuple[list[KVCacheBlock], int]:
|
||||
"""Get the computed (cached) blocks for the request.
|
||||
Note that the computed blocks must be full.
|
||||
|
||||
Args:
|
||||
request: The request to get the computed blocks.
|
||||
|
||||
Returns:
|
||||
A tuple containing:
|
||||
- A list of blocks that are computed for the request.
|
||||
- The number of computed tokens.
|
||||
"""
|
||||
if not self.enable_caching:
|
||||
# Prefix caching is disabled.
|
||||
return [], 0
|
||||
|
||||
# The block hashes for the request may already be computed
|
||||
# if the scheduler has tried to schedule the request before.
|
||||
block_hashes = self.req_to_block_hashes[request.request_id]
|
||||
if not block_hashes:
|
||||
block_hashes = hash_request_tokens(self.caching_hash_fn,
|
||||
self.block_size, request)
|
||||
self.req_to_block_hashes[request.request_id] = block_hashes
|
||||
|
||||
self.prefix_cache_stats.requests += 1
|
||||
if request.sampling_params.prompt_logprobs is None:
|
||||
if len(block_hashes) * self.block_size == request.num_tokens:
|
||||
# When prompt length is divisible by the block size and all
|
||||
# blocks are cached, we need to recompute the last token. This
|
||||
# have to be achieved by re-computing an entire block because
|
||||
# allocate_slots() assumes num_computed_tokens is always a
|
||||
# multiple of the block size. To achieve this, remove the last
|
||||
# block hash from the block_hashes for find_longest_cache_hit
|
||||
# This limitation can potentially be removed in the future to
|
||||
# slightly improve the performance.
|
||||
last_block_hash = block_hashes.pop()
|
||||
else:
|
||||
last_block_hash = None
|
||||
|
||||
computed_blocks = (
|
||||
self.specialized_manager.find_longest_cache_hit(block_hashes))
|
||||
|
||||
if last_block_hash is not None:
|
||||
# Add back the last block hash if it was removed.
|
||||
block_hashes.append(last_block_hash)
|
||||
|
||||
self.prefix_cache_stats.queries += len(block_hashes)
|
||||
self.prefix_cache_stats.hits += len(computed_blocks)
|
||||
|
||||
# NOTE(woosuk): Since incomplete blocks are not eligible for
|
||||
# sharing, `num_computed_tokens` is always a multiple of
|
||||
# `block_size`.
|
||||
num_computed_tokens = len(computed_blocks) * self.block_size
|
||||
return computed_blocks, num_computed_tokens
|
||||
else:
|
||||
# Skip cache hits for prompt logprobs
|
||||
return [], 0
|
||||
|
||||
def allocate_slots(
|
||||
self,
|
||||
request: Request,
|
||||
num_tokens: int,
|
||||
new_computed_blocks: Optional[list[KVCacheBlock]] = None
|
||||
) -> Optional[list[KVCacheBlock]]:
|
||||
"""Add slots for a request with new tokens to append.
|
||||
|
||||
Args:
|
||||
request: The request to allocate slots.
|
||||
num_tokens: The number of tokens to allocate. Note that this does
|
||||
not include the tokens that have already been computed.
|
||||
new_computed_blocks: A list of new computed blocks just hitting the
|
||||
prefix caching.
|
||||
|
||||
Blocks layout:
|
||||
-----------------------------------------------------------------------
|
||||
| < computed > | < new computed > | < new > | < pre-allocated > |
|
||||
-----------------------------------------------------------------------
|
||||
| < required > |
|
||||
--------------------------------------------------
|
||||
| < full > |
|
||||
------------------------------------------------
|
||||
| <new full> |
|
||||
--------------
|
||||
The following *_blocks are illustrated in this layout.
|
||||
|
||||
Returns:
|
||||
A list of new allocated blocks.
|
||||
"""
|
||||
if num_tokens == 0:
|
||||
raise ValueError("num_tokens must be greater than 0")
|
||||
|
||||
new_computed_blocks = new_computed_blocks or []
|
||||
|
||||
req_blocks = self.req_to_blocks[request.request_id]
|
||||
|
||||
# Free the blocks that are skipped during the attention computation
|
||||
# (e.g., tokens outside the sliding window).
|
||||
# We can do this even if we cannot schedule this request due to
|
||||
# insufficient free blocks.
|
||||
# Should call this function before allocating new blocks to reduce
|
||||
# the number of evicted blocks.
|
||||
removed_blocks = self.specialized_manager.remove_skipped_blocks(
|
||||
req_blocks, request.num_computed_tokens)
|
||||
self.block_pool.free_blocks(removed_blocks)
|
||||
|
||||
# The number of computed tokens is the number of computed tokens plus
|
||||
# the new prefix caching hits
|
||||
num_computed_tokens = (request.num_computed_tokens +
|
||||
len(new_computed_blocks) * self.block_size)
|
||||
num_required_blocks = cdiv(num_computed_tokens + num_tokens,
|
||||
self.block_size)
|
||||
num_new_blocks = (num_required_blocks - len(req_blocks) -
|
||||
len(new_computed_blocks))
|
||||
|
||||
# If a computed block of a request is an eviction candidate (in the
|
||||
# free queue and ref_cnt == 0), it cannot be counted as a free block
|
||||
# when allocating this request.
|
||||
num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks
|
||||
if blk.ref_cnt == 0)
|
||||
if (num_new_blocks > self.block_pool.get_num_free_blocks() -
|
||||
num_evictable_computed_blocks):
|
||||
# Cannot allocate new blocks
|
||||
return None
|
||||
|
||||
# Touch the computed blocks to make sure they won't be evicted.
|
||||
if self.enable_caching:
|
||||
self.block_pool.touch(new_computed_blocks)
|
||||
else:
|
||||
assert not new_computed_blocks, (
|
||||
"Computed blocks should be empty when "
|
||||
"prefix caching is disabled")
|
||||
|
||||
# Append the new computed blocks to the request blocks until now to
|
||||
# avoid the case where the new blocks cannot be allocated.
|
||||
req_blocks.extend(new_computed_blocks)
|
||||
|
||||
# Start to handle new blocks
|
||||
|
||||
if num_new_blocks <= 0:
|
||||
# No new block is needed.
|
||||
new_blocks = []
|
||||
else:
|
||||
# Get new blocks from the free block pool considering
|
||||
# preallocated blocks.
|
||||
num_new_blocks = min(
|
||||
num_new_blocks + self.num_preallocate_blocks,
|
||||
self.block_pool.get_num_free_blocks(),
|
||||
# Should not exceed the maximum number of blocks per request.
|
||||
# This is especially because the block table has the shape
|
||||
# [..., max_num_blocks_per_req].
|
||||
self.max_num_blocks_per_req - len(req_blocks),
|
||||
)
|
||||
assert num_new_blocks > 0
|
||||
|
||||
# Concatenate the computed block IDs and the new block IDs.
|
||||
new_blocks = self.block_pool.get_new_blocks(num_new_blocks)
|
||||
req_blocks.extend(new_blocks)
|
||||
|
||||
if not self.enable_caching:
|
||||
return new_blocks
|
||||
|
||||
# Use `new_computed_blocks` for a new request, and `num_cached_block`
|
||||
# for a running request.
|
||||
num_cached_blocks = self.num_cached_block.get(request.request_id,
|
||||
len(new_computed_blocks))
|
||||
# Speculated tokens might be rejected in the future, so we does
|
||||
# not cache any speculated tokens. We only cache blocks with
|
||||
# generated (accepted) tokens.
|
||||
num_full_blocks_after_append = (num_computed_tokens + num_tokens - len(
|
||||
request.spec_token_ids)) // self.block_size
|
||||
|
||||
self.block_pool.cache_full_blocks(
|
||||
request=request,
|
||||
blocks=req_blocks,
|
||||
block_hashes=self.req_to_block_hashes[request.request_id],
|
||||
num_cached_blocks=num_cached_blocks,
|
||||
num_full_blocks=num_full_blocks_after_append,
|
||||
block_size=self.block_size,
|
||||
hash_fn=self.caching_hash_fn,
|
||||
)
|
||||
|
||||
self.num_cached_block[
|
||||
request.request_id] = num_full_blocks_after_append
|
||||
return new_blocks
|
||||
|
||||
def free(self, request: Request) -> None:
|
||||
"""Free the blocks allocated for the request.
|
||||
When caching is enabled, we free the blocks in reverse order so that
|
||||
the tail blocks are evicted first.
|
||||
|
||||
Args:
|
||||
request: The request to free the blocks.
|
||||
"""
|
||||
# Default to [] in case a request is freed (aborted) before alloc.
|
||||
blocks = self.req_to_blocks.pop(request.request_id, [])
|
||||
ordered_blocks: Iterable[KVCacheBlock] = blocks
|
||||
if self.enable_caching:
|
||||
# Free blocks in reverse order so that the tail blocks are
|
||||
# freed first.
|
||||
ordered_blocks = reversed(blocks)
|
||||
|
||||
self.block_pool.free_blocks(ordered_blocks)
|
||||
self.num_cached_block.pop(request.request_id, None)
|
||||
|
||||
def reset_prefix_cache(self) -> bool:
|
||||
"""Reset prefix cache. This function may be used in RLHF
|
||||
flows to invalid prefix caching after the weights are updated,
|
||||
or used for resetting prefix caching status for benchmarking.
|
||||
|
||||
Returns:
|
||||
bool: True if the prefix cache is successfully reset,
|
||||
False otherwise.
|
||||
"""
|
||||
if self.block_pool.reset_prefix_cache():
|
||||
self.prefix_cache_stats.reset = True
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_num_common_prefix_blocks(
|
||||
self,
|
||||
request: Request,
|
||||
num_running_requests: int,
|
||||
) -> int:
|
||||
"""Calculate the number of common prefix blocks shared by all requests
|
||||
in the RUNNING state.
|
||||
|
||||
The function determines this by selecting any request and iterating
|
||||
through its blocks. A block is considered a common prefix block if its
|
||||
`ref_cnt` equals the total number of requests in the RUNNING state.
|
||||
|
||||
NOTE(woosuk): The number of requests in the RUNNING state is **greater
|
||||
than or equal to** the number of requests scheduled in the current step.
|
||||
This is because the RUNNING state only indicates that:
|
||||
1. The request has not yet finished, and
|
||||
2. The request holds its blocks unfreed.
|
||||
|
||||
While all scheduled requests must be in the RUNNING state, the inverse
|
||||
is not necessarily true. There may be RUNNING requests that are not
|
||||
scheduled in the current step.
|
||||
|
||||
This can result in an edge case where the number of common prefix blocks
|
||||
is 0, even though all scheduled requests share a common prefix. This
|
||||
occurs because there may be unscheduled RUNNING requests that do not
|
||||
share the common prefix. Currently, this case cannot be easily detected,
|
||||
so the function returns 0 in such cases.
|
||||
|
||||
Args:
|
||||
request: Any request in the RUNNING state, used to identify the
|
||||
common prefix blocks.
|
||||
num_running_requests: The total number of requests in the RUNNING
|
||||
state. This can be different from the number of scheduled
|
||||
requests in the current step.
|
||||
|
||||
Returns:
|
||||
int: The number of common prefix blocks.
|
||||
"""
|
||||
assert request.status == RequestStatus.RUNNING
|
||||
blocks = self.req_to_blocks[request.request_id]
|
||||
num_common_blocks = 0
|
||||
for block in blocks:
|
||||
if block.ref_cnt == num_running_requests:
|
||||
num_common_blocks += 1
|
||||
else:
|
||||
break
|
||||
return num_common_blocks
|
||||
|
||||
def free_block_hashes(self, request: Request) -> None:
|
||||
"""Discard the block hashes for the request.
|
||||
|
||||
NOTE: Unlike `free`, this method should be called only when the request
|
||||
is finished, not when it is preempted.
|
||||
"""
|
||||
self.req_to_block_hashes.pop(request.request_id, None)
|
||||
690
vllm/v1/core/kv_cache_utils.py
Normal file
690
vllm/v1/core/kv_cache_utils.py
Normal file
@@ -0,0 +1,690 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""KV-Cache Utilities."""
|
||||
import os
|
||||
from collections import deque
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, NamedTuple, Optional
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import sha256
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, KVCacheSpec,
|
||||
KVCacheTensor, SlidingWindowSpec)
|
||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||
from vllm.v1.request import Request
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class BlockHashType(NamedTuple):
|
||||
"""Hash value of a block (int), the token IDs in the block, and extra keys.
|
||||
We keep a tuple of token IDs and extra keys to reduce the likelihood of
|
||||
hash collisions when the hash value is the same. By using SHA256 however,
|
||||
hash collisions are practically impossible.
|
||||
"""
|
||||
# Hash value of the block in an integer.
|
||||
hash_value: int
|
||||
# Token IDs in the block.
|
||||
token_ids: tuple[int, ...]
|
||||
# Extra keys for the block.
|
||||
extra_keys: Optional[Any] = None
|
||||
|
||||
|
||||
# The hash seed for the first block of the prefix block sequence.
|
||||
#
|
||||
# Even if the hash function is the builtin hash(), we use sha256 to generate
|
||||
# the initial hash to simplify the code. This is not performance critical
|
||||
# as it is done one per process.
|
||||
#
|
||||
# We use a random value to avoid hash collisions or PYTHONHASHSEED environment
|
||||
# variable if set such that processes can share the seed if needed.
|
||||
# This aligns with the behavior of Python's hash() function, which also uses
|
||||
# a random seed if PYTHONHASHSEED is not set.
|
||||
NONE_HASH = int.from_bytes(os.urandom(32), byteorder="big") if os.getenv(
|
||||
'PYTHONHASHSEED') is not None else sha256(os.getenv('PYTHONHASHSEED'))
|
||||
|
||||
|
||||
class PrefixCachingMetrics:
|
||||
"""Metrics for prefix caching with a hit rate of the most recent N requests.
|
||||
|
||||
Args:
|
||||
interval: The number of the most recent requests to aggregate.
|
||||
Defaults to 1000.
|
||||
"""
|
||||
|
||||
def __init__(self, interval: int = 1000):
|
||||
self.interval = interval
|
||||
# The current aggregated values.
|
||||
self.aggregated_requests = 0
|
||||
self.aggregated_query_total = 0
|
||||
self.aggregated_query_hit = 0
|
||||
# A deque of (requests, queries, hits) for the most recent requests.
|
||||
self.query_queue: deque[tuple[int, int, int]] = deque()
|
||||
|
||||
def observe(self, stats: PrefixCacheStats):
|
||||
"""Observe the prefix caching for a set of requests.
|
||||
|
||||
This function is called with information gathered when new requests
|
||||
are being scheduled and are looking for computed blocks.
|
||||
|
||||
When there are more than `interval` requests, the oldest set of
|
||||
requestsare removed from the metrics.
|
||||
|
||||
Args:
|
||||
stats: The prefix cache stats.
|
||||
"""
|
||||
# reset_prefix_cache was invoked before the current update.
|
||||
# Reset the metrics before aggregating the current stats.
|
||||
if stats.reset:
|
||||
self.reset()
|
||||
|
||||
# Update the metrics.
|
||||
self.query_queue.append((stats.requests, stats.queries, stats.hits))
|
||||
self.aggregated_requests += stats.requests
|
||||
self.aggregated_query_total += stats.queries
|
||||
self.aggregated_query_hit += stats.hits
|
||||
|
||||
# Remove the oldest stats if the number of requests exceeds.
|
||||
if self.aggregated_requests > self.interval:
|
||||
old_requests, old_queries, old_hits = self.query_queue.popleft()
|
||||
self.aggregated_requests -= old_requests
|
||||
self.aggregated_query_total -= old_queries
|
||||
self.aggregated_query_hit -= old_hits
|
||||
|
||||
def reset(self):
|
||||
"""Reset the metrics."""
|
||||
self.aggregated_requests = 0
|
||||
self.aggregated_query_total = 0
|
||||
self.aggregated_query_hit = 0
|
||||
self.query_queue.clear()
|
||||
|
||||
@property
|
||||
def hit_rate(self) -> float:
|
||||
"""Calculate the hit rate for the past N requests."""
|
||||
if self.aggregated_query_total == 0:
|
||||
return 0.0
|
||||
return self.aggregated_query_hit / self.aggregated_query_total
|
||||
|
||||
|
||||
@dataclass
|
||||
class KVCacheBlock:
|
||||
"""KV-cache block metadata."""
|
||||
# Block ID, ranging from 0 to num_gpu_blocks - 1.
|
||||
block_id: int
|
||||
# Reference count.
|
||||
ref_cnt: int = 0
|
||||
# The hash of the block composed of (block hash, tuple of token IDs).
|
||||
# It is only available when the block is full.
|
||||
_block_hash: Optional[BlockHashType] = None
|
||||
|
||||
# Used to construct a doubly linked list for free blocks.
|
||||
# These two attributes should only be manipulated by FreeKVCacheBlockQueue.
|
||||
prev_free_block: Optional["KVCacheBlock"] = None
|
||||
next_free_block: Optional["KVCacheBlock"] = None
|
||||
|
||||
def incr_ref(self):
|
||||
self.ref_cnt += 1
|
||||
|
||||
def decr_ref(self):
|
||||
self.ref_cnt -= 1
|
||||
|
||||
@property
|
||||
def block_hash(self) -> Optional[BlockHashType]:
|
||||
return self._block_hash
|
||||
|
||||
@block_hash.setter
|
||||
def block_hash(self, block_hash: BlockHashType):
|
||||
assert self.block_hash is None, (
|
||||
"The block already has a hash. This should not happen.")
|
||||
self._block_hash = block_hash
|
||||
|
||||
def reset_hash(self):
|
||||
"""Reset the block hash when the block is evicted."""
|
||||
self._block_hash = None
|
||||
|
||||
def __repr__(self) -> str:
|
||||
# Use block_id instead of KVCacheBlock object to avoid calling __repr__
|
||||
# on KVCacheBlock object recursively.
|
||||
prev_block_id = self.prev_free_block.block_id \
|
||||
if self.prev_free_block else None
|
||||
next_block_id = self.next_free_block.block_id \
|
||||
if self.next_free_block else None
|
||||
return (f"KVCacheBlock(block_id={self.block_id}, "
|
||||
f"ref_cnt={self.ref_cnt}, "
|
||||
f"_block_hash={self._block_hash}, "
|
||||
f"prev_free_block={prev_block_id}, "
|
||||
f"next_free_block={next_block_id})")
|
||||
|
||||
|
||||
class FreeKVCacheBlockQueue:
|
||||
"""This class organizes a list of KVCacheBlock objects to a doubly linked
|
||||
list of free blocks. We implement this class instead of using Python
|
||||
builtin deque to support removing a block in the middle of the queue
|
||||
in O(1) time. To close the performance gap to the builtin deque which is
|
||||
implemented in C++, this class does not allocate any Python objects when
|
||||
manipulating the linked list. Instead, this class manipulates the
|
||||
prev_free_block and next_free_block attributes of the given blocks.
|
||||
|
||||
The queue is ordered by block ID in the beginning. When a block is allocated
|
||||
and then freed, it will be appended back with the eviction order:
|
||||
1. The least recent used block is at the front (LRU).
|
||||
2. If two blocks have the same last accessed time (allocated by the
|
||||
same sequence), the one with more hash tokens (the tail of a block
|
||||
chain) is at the front.
|
||||
Note that we maintain this order by reversing the block order when free
|
||||
blocks of a request. This operation is outside of this class.
|
||||
|
||||
Args:
|
||||
blocks: A list of KVCacheBlock objects.
|
||||
"""
|
||||
|
||||
def __init__(self, blocks: list[KVCacheBlock]) -> None:
|
||||
self.num_free_blocks = len(blocks)
|
||||
|
||||
# Initialize the doubly linked list of free blocks.
|
||||
self.free_list_head: Optional[KVCacheBlock] = blocks[0]
|
||||
self.free_list_tail: Optional[KVCacheBlock] = blocks[-1]
|
||||
for i in range(self.num_free_blocks):
|
||||
if i > 0:
|
||||
blocks[i].prev_free_block = blocks[i - 1]
|
||||
if i < self.num_free_blocks - 1:
|
||||
blocks[i].next_free_block = blocks[i + 1]
|
||||
|
||||
def popleft(self) -> KVCacheBlock:
|
||||
"""Pop the first free block and reduce num_free_blocks by 1.
|
||||
|
||||
Returns:
|
||||
The first free block.
|
||||
"""
|
||||
if not self.free_list_head:
|
||||
raise ValueError("No free blocks available")
|
||||
|
||||
block = self.free_list_head
|
||||
self.remove(block)
|
||||
return block
|
||||
|
||||
def remove(self, block: KVCacheBlock) -> None:
|
||||
"""Remove a block in the free list and reduce num_free_blocks by 1.
|
||||
|
||||
Args:
|
||||
block: The block to remove.
|
||||
"""
|
||||
if block.prev_free_block is not None:
|
||||
# Link the previous block to the next block.
|
||||
block.prev_free_block.next_free_block = block.next_free_block
|
||||
if block.next_free_block is not None:
|
||||
# Link the next block to the previous block.
|
||||
block.next_free_block.prev_free_block = block.prev_free_block
|
||||
|
||||
if block == self.free_list_head:
|
||||
# Update the head if the block is the head.
|
||||
self.free_list_head = block.next_free_block
|
||||
if block == self.free_list_tail:
|
||||
# Update the tail if the block is the tail.
|
||||
self.free_list_tail = block.prev_free_block
|
||||
|
||||
# Remove the block from the linked list.
|
||||
block.prev_free_block = block.next_free_block = None
|
||||
self.num_free_blocks -= 1
|
||||
|
||||
def append(self, block: KVCacheBlock) -> None:
|
||||
"""Put a block back into the free list and increase
|
||||
num_free_blocks by 1.
|
||||
|
||||
Args:
|
||||
block: The block to append.
|
||||
"""
|
||||
if self.free_list_tail is not None:
|
||||
# Link the last block to the new block.
|
||||
self.free_list_tail.next_free_block = block
|
||||
block.prev_free_block = self.free_list_tail
|
||||
self.free_list_tail = block
|
||||
else:
|
||||
# The free list is empty.
|
||||
assert self.free_list_head is None
|
||||
self.free_list_head = self.free_list_tail = block
|
||||
|
||||
block.next_free_block = None
|
||||
self.num_free_blocks += 1
|
||||
|
||||
def get_all_free_blocks(self) -> list[KVCacheBlock]:
|
||||
"""Get all free blocks in the free list. Mainly used for testing.
|
||||
|
||||
Returns:
|
||||
A list of free blocks.
|
||||
"""
|
||||
ret = []
|
||||
curr_block = self.free_list_head
|
||||
while curr_block is not None:
|
||||
ret.append(curr_block)
|
||||
curr_block = curr_block.next_free_block
|
||||
return ret
|
||||
|
||||
|
||||
def need_extra_keys(request: Request) -> bool:
|
||||
"""Check whether the blocks allocated to this request need extra hash keys.
|
||||
|
||||
Args:
|
||||
request (Request): The request.
|
||||
|
||||
Returns:
|
||||
bool: Whether blocks allocated to this request need extra hash keys.
|
||||
"""
|
||||
|
||||
# Multimodal requests need to include the MM hash.
|
||||
# LoRA requests need to include the LoRA ID.
|
||||
return bool(request.mm_positions) or (request.lora_request is not None)
|
||||
|
||||
|
||||
def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int,
|
||||
end_token_idx: int,
|
||||
start_mm_idx: int) -> tuple[list[Any], int]:
|
||||
"""Generate extra keys related to MultiModal request for block hash
|
||||
computation. For multi-modal inputs, the extra keys are
|
||||
(mm_hash, start_offset) that indicate a mm input contained in the
|
||||
block and its starting offset in the block tokens.
|
||||
|
||||
Args:
|
||||
request: The request object.
|
||||
start_token_idx: The start token index of the block.
|
||||
end_token_idx: The end token index of the block.
|
||||
start_mm_idx: The start multi-modal index of the block.
|
||||
|
||||
Returns:
|
||||
A tuple of extra keys and the next multi-modal index.
|
||||
"""
|
||||
extra_keys: list[Any] = []
|
||||
|
||||
mm_positions, mm_hashes = request.mm_positions, request.mm_hashes
|
||||
if not mm_positions:
|
||||
return extra_keys, start_mm_idx
|
||||
|
||||
if mm_positions and len(mm_positions) != len(mm_hashes):
|
||||
raise ValueError(
|
||||
"The number of multi-modal positions and hashes must match. This "
|
||||
"is likely because you do not enable MM preprocessor hashing. "
|
||||
"Please set disable_mm_preprocessor_cache=False.")
|
||||
|
||||
# Note that we assume mm_positions is sorted by offset.
|
||||
# We do not need to check all mm inputs if the start token index is out of
|
||||
# range. This usually happens in the late prefill phase and decoding phase.
|
||||
if mm_positions[-1]["offset"] + mm_positions[-1][
|
||||
"length"] < start_token_idx:
|
||||
return extra_keys, start_mm_idx
|
||||
|
||||
# Support start_mm_idx == -1 to indicate the last mm input.
|
||||
if start_mm_idx < 0:
|
||||
assert -start_mm_idx <= len(mm_positions)
|
||||
start_mm_idx = len(mm_positions) + start_mm_idx
|
||||
|
||||
curr_mm_idx = start_mm_idx
|
||||
while mm_positions and curr_mm_idx < len(mm_positions):
|
||||
assert mm_hashes[curr_mm_idx] is not None
|
||||
offset = mm_positions[curr_mm_idx]["offset"]
|
||||
length = mm_positions[curr_mm_idx]["length"]
|
||||
if end_token_idx > offset:
|
||||
if start_token_idx > offset + length:
|
||||
# This block has passed the current mm input.
|
||||
curr_mm_idx += 1
|
||||
continue
|
||||
|
||||
# The block contains the current mm input.
|
||||
extra_keys.append(mm_hashes[curr_mm_idx])
|
||||
|
||||
if end_token_idx >= offset + length:
|
||||
# If this block contains the end of the current mm input,
|
||||
# move to the next mm input as this block may also contain
|
||||
# the next mm input.
|
||||
curr_mm_idx += 1
|
||||
else:
|
||||
# Otherwise this block is done with mm inputs.
|
||||
break
|
||||
else:
|
||||
# This block has not reached the current mm input.
|
||||
break
|
||||
return extra_keys, curr_mm_idx
|
||||
|
||||
|
||||
def _gen_lora_extra_hash_keys(request: Request) -> list[int]:
|
||||
"""Generate extra keys related to LoRA for block hash computation.
|
||||
|
||||
Args:
|
||||
request: The request object.
|
||||
|
||||
Returns:
|
||||
Return LoRA id of the request if it is a LoRA request. Return empty
|
||||
list otherwise.
|
||||
"""
|
||||
if not request.lora_request:
|
||||
return []
|
||||
return [request.lora_request.lora_int_id]
|
||||
|
||||
|
||||
def generate_block_hash_extra_keys(
|
||||
request: Request, start_token_idx: int, end_token_idx: int,
|
||||
start_mm_idx: int) -> tuple[Optional[tuple[Any, ...]], int]:
|
||||
"""Generate extra keys for the block hash. The extra keys can come from
|
||||
the multi-modal inputs and request specific metadata (e.g., LoRA ID).
|
||||
|
||||
Args:
|
||||
request: The request object.
|
||||
start_token_idx: The start token index of the block.
|
||||
end_token_idx: The end token index of the block.
|
||||
start_mm_idx: The start multi-modal index of the block.
|
||||
|
||||
Returns:
|
||||
A tuple of extra keys and the next multi-modal index.
|
||||
"""
|
||||
mm_extra_keys: list[Any]
|
||||
mm_extra_keys, new_start_mm_idx = _gen_mm_extra_hash_keys(
|
||||
request, start_token_idx, end_token_idx, start_mm_idx)
|
||||
lora_extra_keys: list[int] = _gen_lora_extra_hash_keys(request)
|
||||
|
||||
extra_keys: list[Any] = lora_extra_keys + mm_extra_keys
|
||||
|
||||
if not extra_keys:
|
||||
return None, new_start_mm_idx
|
||||
|
||||
return tuple(extra_keys), new_start_mm_idx
|
||||
|
||||
|
||||
def hash_block_tokens(
|
||||
hash_function: Callable,
|
||||
parent_block_hash: Optional[int],
|
||||
curr_block_token_ids: Sequence[int],
|
||||
extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHashType:
|
||||
"""Computes a hash value corresponding to the contents of a block and
|
||||
the contents of the preceding block(s). The hash value is used for
|
||||
prefix caching. We use LRU cache for this function to avoid recomputing
|
||||
hash values for the same block contents.
|
||||
|
||||
Args:
|
||||
parent_block_hash: The hash of the parent block. None
|
||||
if this is the first block.
|
||||
curr_block_token_ids: A list of token ids in the current
|
||||
block. The current block is assumed to be full.
|
||||
extra_keys: Extra keys for the block.
|
||||
|
||||
Returns:
|
||||
The hash value of the block and the token ids in the block.
|
||||
The entire tuple is used as the hash key of the block.
|
||||
"""
|
||||
if not parent_block_hash:
|
||||
parent_block_hash = NONE_HASH
|
||||
|
||||
curr_block_token_ids_tuple = tuple(curr_block_token_ids)
|
||||
return BlockHashType(
|
||||
hash_function(
|
||||
(parent_block_hash, curr_block_token_ids_tuple, extra_keys)),
|
||||
curr_block_token_ids_tuple, extra_keys)
|
||||
|
||||
|
||||
def hash_request_tokens(hash_function: Any, block_size: int,
|
||||
request: Request) -> list[BlockHashType]:
|
||||
"""Computes hash values of a chain of blocks given a sequence of
|
||||
token IDs. The hash value is used for prefix caching.
|
||||
|
||||
Args:
|
||||
block_size: The size of each block.
|
||||
request: The request object.
|
||||
|
||||
Returns:
|
||||
The list of computed hash values.
|
||||
"""
|
||||
token_ids = request.all_token_ids
|
||||
|
||||
req_need_extra_keys = need_extra_keys(request)
|
||||
req_extra_keys = None
|
||||
curr_mm_idx = 0
|
||||
|
||||
ret = []
|
||||
parent_block_hash_value = None
|
||||
for start in range(0, len(token_ids), block_size):
|
||||
end = start + block_size
|
||||
block_token_ids = token_ids[start:end]
|
||||
# Do not hash the block if it is not full.
|
||||
if len(block_token_ids) < block_size:
|
||||
break
|
||||
|
||||
if req_need_extra_keys:
|
||||
# MM and LoRA requests need extra keys for block-hash computation.
|
||||
req_extra_keys, curr_mm_idx = generate_block_hash_extra_keys(
|
||||
request, start, end, curr_mm_idx)
|
||||
|
||||
block_hash = hash_block_tokens(hash_function, parent_block_hash_value,
|
||||
block_token_ids, req_extra_keys)
|
||||
ret.append(block_hash)
|
||||
parent_block_hash_value = block_hash.hash_value
|
||||
return ret
|
||||
|
||||
|
||||
def check_enough_kv_cache_memory(vllm_config: VllmConfig,
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int):
|
||||
"""
|
||||
Checks whether `available_memory` is enough for the KV cache to hold at
|
||||
least one request with the model's max_model_len.
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes.
|
||||
|
||||
Raises:
|
||||
ValueError: If there is not enough memory available for the KV cache.
|
||||
"""
|
||||
|
||||
if available_memory <= 0:
|
||||
raise ValueError("No available memory for the cache blocks. "
|
||||
"Try increasing `gpu_memory_utilization` when "
|
||||
"initializing the engine.")
|
||||
|
||||
max_model_len = vllm_config.model_config.max_model_len
|
||||
needed_memory = 0
|
||||
for layer_spec in kv_cache_spec.values():
|
||||
needed_memory += layer_spec.max_memory_usage_bytes(vllm_config)
|
||||
|
||||
if needed_memory > available_memory:
|
||||
raise ValueError(
|
||||
f"To serve at least one request with the models's max seq len "
|
||||
f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GiB KV "
|
||||
f"cache is needed, which is larger than the available KV cache "
|
||||
f"memory ({available_memory/1024/1024/1024:.2f} GiB). Try "
|
||||
f"increasing `gpu_memory_utilization` or decreasing "
|
||||
f"`max_model_len` when initializing the engine.")
|
||||
|
||||
|
||||
def create_kv_cache_group_specs(
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
grouped_layer_names: list[list[str]]) -> list[KVCacheGroupSpec]:
|
||||
"""
|
||||
Create KVCacheGroupSpec object for each kv cache group layer.
|
||||
The layers in the same group should share the same
|
||||
KVCacheSpec.
|
||||
|
||||
Args:
|
||||
kv_cache_spec:
|
||||
A mapping from each layer name to its corresponding KVCacheSpec.
|
||||
grouped_layer_names:
|
||||
A list of kv cache groups, where each element is a list of layer
|
||||
names that belong to the same group and should share the same
|
||||
KVCacheSpec.
|
||||
Returns:
|
||||
A list of KVCacheGroupSpec objects, one for each group.
|
||||
"""
|
||||
kv_cache_groups = []
|
||||
for layer_names_one_group in grouped_layer_names:
|
||||
layer_spec = kv_cache_spec[layer_names_one_group[0]]
|
||||
assert all(
|
||||
kv_cache_spec[layer_name] == layer_spec
|
||||
for layer_name in layer_names_one_group[1:]), (
|
||||
"All layers in the same KV cache group must share the same "
|
||||
"KVCacheSpec.")
|
||||
kv_cache_groups.append(
|
||||
KVCacheGroupSpec(layer_names_one_group, layer_spec))
|
||||
return kv_cache_groups
|
||||
|
||||
|
||||
def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
|
||||
"""
|
||||
Whether all layers in the given KVCacheSpec have the same type of KV cache.
|
||||
|
||||
Args:
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
|
||||
Returns:
|
||||
True if all layers have the same type, False otherwise.
|
||||
"""
|
||||
|
||||
layer_keys = set(layer.type_id for layer in kv_cache_spec.values())
|
||||
return len(layer_keys) == 1
|
||||
|
||||
|
||||
def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int) -> KVCacheConfig:
|
||||
"""
|
||||
Generates the KV cache configuration for a model with one type of KV cache.
|
||||
Divide the available memory equally among all layers.
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes.
|
||||
|
||||
Returns:
|
||||
The generated KVCacheConfig
|
||||
"""
|
||||
|
||||
page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
|
||||
scale_page_sizes = {layer.scale_page_size_bytes for layer in kv_cache_spec.values()}
|
||||
assert len(page_sizes) == 1
|
||||
page_size = page_sizes.pop()
|
||||
scale_page_size = scale_page_sizes.pop()
|
||||
|
||||
num_blocks = int(available_memory // (page_size + scale_page_size) // len(kv_cache_spec))
|
||||
num_blocks = max(num_blocks, 0)
|
||||
|
||||
if vllm_config.cache_config.num_gpu_blocks_override is not None:
|
||||
num_gpu_blocks_override = \
|
||||
vllm_config.cache_config.num_gpu_blocks_override
|
||||
logger.info(
|
||||
"Overriding num_gpu_blocks=%d with "
|
||||
"num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
|
||||
num_blocks = num_gpu_blocks_override
|
||||
|
||||
num_tokens = num_blocks * vllm_config.cache_config.block_size
|
||||
num_tokens_str = f"{num_tokens:,}"
|
||||
logger.info("GPU KV cache size: %s tokens", num_tokens_str)
|
||||
max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
|
||||
max_concurrency = num_tokens / vllm_config.model_config.max_model_len
|
||||
logger.info("Maximum concurrency for %s tokens per request: %.2fx",
|
||||
max_model_len_str, max_concurrency)
|
||||
|
||||
per_layer_size = (page_size + scale_page_size) * num_blocks
|
||||
# All layers have the same KV cache spec, so we create one kv cache group
|
||||
# for all layers.
|
||||
grouped_layer_names = [list(kv_cache_spec.keys())]
|
||||
|
||||
kv_cache_config = KVCacheConfig(
|
||||
num_blocks=num_blocks,
|
||||
tensors={
|
||||
layer_name: KVCacheTensor(size=per_layer_size)
|
||||
for layer_name in kv_cache_spec
|
||||
},
|
||||
kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
|
||||
grouped_layer_names),
|
||||
)
|
||||
return kv_cache_config
|
||||
|
||||
|
||||
def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
|
||||
"""
|
||||
Only models with one type of KV cache are supported yet. This function tries
|
||||
to convert the KV cache specs to one type if the model is a hybrid model
|
||||
with multiple type of KV cache. It will convert all SlidingWindowSpec to
|
||||
FullAttentionSpec if both types are present.
|
||||
|
||||
Args:
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
"""
|
||||
|
||||
has_full_attention = any(
|
||||
isinstance(spec, FullAttentionSpec) for spec in kv_cache_spec.values())
|
||||
has_sliding_window = any(
|
||||
isinstance(spec, SlidingWindowSpec) for spec in kv_cache_spec.values())
|
||||
if has_full_attention and has_sliding_window:
|
||||
for layer_name, spec in kv_cache_spec.items():
|
||||
if isinstance(spec, SlidingWindowSpec):
|
||||
kv_cache_spec[layer_name] = FullAttentionSpec(
|
||||
block_size=spec.block_size,
|
||||
num_kv_heads=spec.num_kv_heads,
|
||||
head_size=spec.head_size,
|
||||
dtype=spec.dtype,
|
||||
use_mla=spec.use_mla,
|
||||
)
|
||||
|
||||
|
||||
def get_kv_cache_config(vllm_config: VllmConfig,
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int) -> KVCacheConfig:
|
||||
"""
|
||||
Generates the KV cache configuration for a model
|
||||
TODO: support hybrid models with more than one type of KV cache.
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes.
|
||||
|
||||
Returns:
|
||||
The generated KVCacheConfigs
|
||||
"""
|
||||
check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
|
||||
unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
if is_kv_cache_type_uniform(kv_cache_spec):
|
||||
# KV cache of all layers are the same, which is true for
|
||||
# most models. Allocate the same amount of memory for
|
||||
# each layer.
|
||||
return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
|
||||
available_memory)
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]):
|
||||
"""
|
||||
Make the KV cache configurations for each worker consistent, so that all
|
||||
workers can be controlled by the same KVCacheManager.
|
||||
This function verifies that the layer group of each worker are the same,
|
||||
and changes the num_blocks of each worker to the smallest among all workers.
|
||||
|
||||
Args:
|
||||
kv_cache_configs: The KV cache configurations for each worker. Will be
|
||||
in-place modified to make them consistent.
|
||||
"""
|
||||
|
||||
# Sort the kv cache groups by the type_id of their KV cache spec.
|
||||
# This can avoid the inconsistency caused by the order of groups.
|
||||
for kv_cache_config in kv_cache_configs:
|
||||
kv_cache_config.kv_cache_groups.sort(
|
||||
key=lambda x: x.kv_cache_spec.type_id)
|
||||
|
||||
# Verify that the groups of each rank are the same.
|
||||
for kv_cache_config in kv_cache_configs[1:]:
|
||||
for group_rank_0, group_rank_i in zip(
|
||||
kv_cache_configs[0].kv_cache_groups,
|
||||
kv_cache_config.kv_cache_groups):
|
||||
assert group_rank_0.kv_cache_spec == group_rank_i.kv_cache_spec
|
||||
|
||||
# Change the num_blocks of each rank to the smallest among all ranks. We
|
||||
# do not need to shrink the tensor size because it is valid to only use the
|
||||
# first `num_blocks` blocks of the tensor.
|
||||
min_num_blocks = min(kv_cache_config.num_blocks
|
||||
for kv_cache_config in kv_cache_configs)
|
||||
for kv_cache_config in kv_cache_configs:
|
||||
kv_cache_config.num_blocks = min_num_blocks
|
||||
|
||||
return kv_cache_configs
|
||||
0
vllm/v1/core/sched/__init__.py
Normal file
0
vllm/v1/core/sched/__init__.py
Normal file
139
vllm/v1/core/sched/interface.py
Normal file
139
vllm/v1/core/sched/interface.py
Normal file
@@ -0,0 +1,139 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Iterable
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.engine import EngineCoreOutputs
|
||||
from vllm.v1.metrics.stats import SchedulerStats
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
|
||||
|
||||
class SchedulerInterface(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def schedule(self) -> "SchedulerOutput":
|
||||
"""Schedule the requests to process in this scheduling step.
|
||||
|
||||
The scheduling decision is made at the iteration level. Each scheduling
|
||||
step corresponds to a single forward pass of the model. Therefore, this
|
||||
method is called repeatedly by a busy loop in the engine.
|
||||
|
||||
Essentially, the scheduler produces a dictionary of {req_id: num_tokens}
|
||||
that specifies how many tokens to process for each request in this
|
||||
scheduling step. For example, num_tokens can be as large as the number
|
||||
of prompt tokens for new requests, or it can be 1 for the requests that
|
||||
are auto-regressively generating new tokens one by one. Otherwise, it
|
||||
can be somewhere in between in case of chunked prefills, prefix caching,
|
||||
speculative decoding, etc.
|
||||
|
||||
Additionally, the scheduler also returns useful data about each request
|
||||
or the batch as a whole. The model runner will use this information in
|
||||
preparing inputs to the model.
|
||||
|
||||
Returns:
|
||||
A SchedulerOutput object containing information about the scheduled
|
||||
requests.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def update_from_output(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
model_runner_output: "ModelRunnerOutput",
|
||||
) -> "EngineCoreOutputs":
|
||||
"""Update the scheduler state based on the model runner output.
|
||||
|
||||
This method is called after the model runner has processed the scheduled
|
||||
requests. The model runner output includes generated token ids, draft
|
||||
token ids for next step, etc. The scheduler uses this information to
|
||||
update its states, checks the finished requests, and returns the output
|
||||
for each request.
|
||||
|
||||
Returns:
|
||||
A EngineCoreOutputs object containing the outputs for each request.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def add_request(self, request: "Request") -> None:
|
||||
"""Add a new request to the scheduler's internal queue.
|
||||
|
||||
Args:
|
||||
request: The new request being added.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def finish_requests(
|
||||
self,
|
||||
request_ids: Union[str, Iterable[str]],
|
||||
finished_status: "RequestStatus",
|
||||
) -> None:
|
||||
"""Finish the requests in the scheduler's internal queue. If the request
|
||||
is not in the queue, this method will do nothing.
|
||||
|
||||
This method is called in two cases:
|
||||
1. When the request is aborted by the client.
|
||||
2. When the frontend process detects a stop string of the request after
|
||||
de-tokenizing its generated tokens.
|
||||
|
||||
Args:
|
||||
request_ids: A single or a list of request IDs.
|
||||
finished_status: The finished status of the given requests.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_num_unfinished_requests(self) -> int:
|
||||
"""Number of unfinished requests in the scheduler's internal queue."""
|
||||
raise NotImplementedError
|
||||
|
||||
def has_unfinished_requests(self) -> bool:
|
||||
"""Returns True if there are unfinished requests in the scheduler's
|
||||
internal queue."""
|
||||
return self.get_num_unfinished_requests() > 0
|
||||
|
||||
@abstractmethod
|
||||
def has_finished_requests(self) -> bool:
|
||||
"""Returns True if there are finished requests that need to be cleared.
|
||||
NOTE: This is different from `not self.has_unfinished_requests()`.
|
||||
|
||||
The scheduler maintains an internal list of the requests finished in the
|
||||
previous step. This list is returned from the next call to schedule(),
|
||||
to be sent to the model runner in the next step to clear cached states
|
||||
for these finished requests.
|
||||
|
||||
This method checks if this internal list of finished requests is
|
||||
non-empty. This information is useful for DP attention.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def has_requests(self) -> bool:
|
||||
"""Returns True if there are unfinished requests, or finished requests
|
||||
not yet returned in SchedulerOutputs."""
|
||||
return self.has_unfinished_requests() or self.has_finished_requests()
|
||||
|
||||
@abstractmethod
|
||||
def get_num_unscheduled_requests(self) -> int:
|
||||
"""Number of requests that are not being processed by the executor."""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def reset_prefix_cache(self) -> bool:
|
||||
"""Reset the prefix cache for KV cache.
|
||||
|
||||
This is particularly required when the model weights are live-updated.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def make_stats(self) -> Optional["SchedulerStats"]:
|
||||
"""Make a SchedulerStats object for logging.
|
||||
|
||||
The SchedulerStats object is created for every scheduling step.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
123
vllm/v1/core/sched/output.py
Normal file
123
vllm/v1/core/sched/output.py
Normal file
@@ -0,0 +1,123 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.request import Request
|
||||
|
||||
|
||||
@dataclass
|
||||
class NewRequestData:
|
||||
|
||||
req_id: str
|
||||
prompt_token_ids: list[int]
|
||||
prompt: Optional[str]
|
||||
mm_inputs: list[MultiModalKwargs]
|
||||
mm_hashes: list[str]
|
||||
mm_positions: list[PlaceholderRange]
|
||||
sampling_params: SamplingParams
|
||||
block_ids: list[int]
|
||||
num_computed_tokens: int
|
||||
lora_request: Optional[LoRARequest]
|
||||
|
||||
@classmethod
|
||||
def from_request(
|
||||
cls,
|
||||
request: Request,
|
||||
block_ids: list[int],
|
||||
) -> NewRequestData:
|
||||
return cls(
|
||||
req_id=request.request_id,
|
||||
prompt_token_ids=request.prompt_token_ids,
|
||||
prompt=request.prompt,
|
||||
mm_inputs=request.mm_inputs,
|
||||
mm_hashes=request.mm_hashes,
|
||||
mm_positions=request.mm_positions,
|
||||
sampling_params=request.sampling_params,
|
||||
block_ids=block_ids,
|
||||
num_computed_tokens=request.num_computed_tokens,
|
||||
lora_request=request.lora_request,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class CachedRequestData:
|
||||
|
||||
req_id: str
|
||||
# If resumed_from_preemption is False, new_block_ids will be appended to
|
||||
# the request's block IDs. If True, new_block_ids will be used as the
|
||||
# request's block IDs instead of appending to the existing block IDs.
|
||||
resumed_from_preemption: bool
|
||||
new_token_ids: list[int]
|
||||
new_block_ids: list[int]
|
||||
num_computed_tokens: int
|
||||
|
||||
@classmethod
|
||||
def from_request(
|
||||
cls,
|
||||
request: Request,
|
||||
resumed_from_preemption: bool,
|
||||
new_token_ids: list[int],
|
||||
new_block_ids: list[int],
|
||||
) -> CachedRequestData:
|
||||
return cls(
|
||||
req_id=request.request_id,
|
||||
resumed_from_preemption=resumed_from_preemption,
|
||||
new_token_ids=new_token_ids,
|
||||
new_block_ids=new_block_ids,
|
||||
num_computed_tokens=request.num_computed_tokens,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SchedulerOutput:
|
||||
|
||||
# list of the requests that are scheduled for the first time.
|
||||
# We cache the request's data in each worker process, so that we don't
|
||||
# need to re-send it every scheduling step.
|
||||
scheduled_new_reqs: list[NewRequestData]
|
||||
# list of the requests that have been scheduled before.
|
||||
# Since the request's data is already cached in the worker processes,
|
||||
# we only send the diff to minimize the communication cost.
|
||||
scheduled_cached_reqs: list[CachedRequestData]
|
||||
|
||||
# req_id -> num_scheduled_tokens
|
||||
# Number of tokens scheduled for each request.
|
||||
num_scheduled_tokens: dict[str, int]
|
||||
# Total number of tokens scheduled for all requests.
|
||||
# Equal to sum(num_scheduled_tokens.values())
|
||||
total_num_scheduled_tokens: int
|
||||
# req_id -> spec_token_ids
|
||||
# If a request does not have any spec decode tokens, it will not be
|
||||
# included in the dictionary.
|
||||
scheduled_spec_decode_tokens: dict[str, list[int]]
|
||||
# req_id -> encoder input indices that need processing.
|
||||
# E.g., if a request has [0, 1], it could mean the vision encoder needs
|
||||
# to process that the request's 0-th and 1-th images in the current step.
|
||||
scheduled_encoder_inputs: dict[str, list[int]]
|
||||
# Number of common prefix blocks for all requests.
|
||||
# This can be used for cascade attention.
|
||||
num_common_prefix_blocks: int
|
||||
|
||||
# Request IDs that are finished in between the previous and the current
|
||||
# steps. This is used to notify the workers about the finished requests
|
||||
# so that they can free the cached states for those requests.
|
||||
finished_req_ids: set[str]
|
||||
# list of (req_id, encoder_input_index) tuples.
|
||||
# Used to free the encoder cache.
|
||||
free_encoder_input_ids: list[tuple[str, int]]
|
||||
|
||||
# Dict of request ids to their index within the batch
|
||||
# for filling the next token bitmask
|
||||
structured_output_request_ids: dict[str, int]
|
||||
# the bitmask for the whole batch
|
||||
grammar_bitmask: Optional[npt.NDArray[np.int32]]
|
||||
759
vllm/v1/core/sched/scheduler.py
Normal file
759
vllm/v1/core/sched/scheduler.py
Normal file
@@ -0,0 +1,759 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from collections import deque
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||
from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
|
||||
compute_encoder_budget)
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheManager
|
||||
from vllm.v1.core.sched.interface import SchedulerInterface
|
||||
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
|
||||
SchedulerOutput)
|
||||
from vllm.v1.core.sched.utils import check_stop
|
||||
from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput,
|
||||
EngineCoreOutputs)
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.metrics.stats import SchedulerStats
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.spec_decode.metrics import SpecDecodingStats
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class Scheduler(SchedulerInterface):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
scheduler_config: SchedulerConfig,
|
||||
model_config: ModelConfig,
|
||||
cache_config: CacheConfig,
|
||||
lora_config: Optional[LoRAConfig],
|
||||
kv_cache_config: KVCacheConfig,
|
||||
structured_output_manager: StructuredOutputManager,
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
include_finished_set: bool = False,
|
||||
log_stats: bool = False,
|
||||
) -> None:
|
||||
self.scheduler_config = scheduler_config
|
||||
self.cache_config = cache_config
|
||||
self.lora_config = lora_config
|
||||
self.kv_cache_config = kv_cache_config
|
||||
self.log_stats = log_stats
|
||||
self.structured_output_manager = structured_output_manager
|
||||
|
||||
# include_finished_set controls whether a separate set of finished
|
||||
# request ids should be included in the EngineCoreOutputs returned
|
||||
# by update_from_outputs(). This is currently used in the multi-engine
|
||||
# case to track request lifetimes efficiently.
|
||||
self.include_finished_set = include_finished_set
|
||||
|
||||
# Scheduling constraints.
|
||||
self.max_num_running_reqs = self.scheduler_config.max_num_seqs
|
||||
self.max_num_scheduled_tokens = \
|
||||
self.scheduler_config.max_num_batched_tokens
|
||||
self.max_model_len = self.scheduler_config.max_model_len
|
||||
|
||||
# Create the KV cache manager.
|
||||
self.kv_cache_manager = KVCacheManager(
|
||||
kv_cache_config=kv_cache_config,
|
||||
max_model_len=self.max_model_len,
|
||||
enable_caching=cache_config.enable_prefix_caching,
|
||||
caching_hash_algo=self.cache_config.prefix_caching_hash_algo,
|
||||
log_stats=self.log_stats)
|
||||
self.block_size = self.cache_config.block_size
|
||||
|
||||
# req_id -> Request
|
||||
self.requests: dict[str, Request] = {}
|
||||
# Priority queues for requests.
|
||||
self.waiting: deque[Request] = deque()
|
||||
self.running: list[Request] = []
|
||||
# The requests that have been scheduled and are being executed
|
||||
# by the executor.
|
||||
self.scheduled_req_ids: set[str] = set()
|
||||
|
||||
# The request IDs that are finished in between the previous and the
|
||||
# current steps. This is used to notify the workers about the finished
|
||||
# requests so that they can free the cached states for those requests.
|
||||
# This is flushed at the end of each scheduling step.
|
||||
self.finished_req_ids: set[str] = set()
|
||||
|
||||
# OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
|
||||
# them at each scheduling step.
|
||||
# Request id -> CachedRequestData
|
||||
self._cached_reqs_data: dict[str, CachedRequestData] = {}
|
||||
|
||||
# Encoder-related.
|
||||
# Calculate encoder cache size if applicable
|
||||
# NOTE: For now we use the same budget for both compute and space.
|
||||
# This can be changed when we make encoder cache for embedding caching
|
||||
# across requests.
|
||||
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
|
||||
model_config=model_config,
|
||||
scheduler_config=scheduler_config,
|
||||
mm_registry=mm_registry,
|
||||
)
|
||||
|
||||
# NOTE(woosuk): Here, "encoder" includes the vision encoder (and
|
||||
# projector if needed). Currently, we assume that the encoder also
|
||||
# has the Transformer architecture (e.g., ViT).
|
||||
self.max_num_encoder_input_tokens = encoder_compute_budget
|
||||
# NOTE: For the models without encoder (e.g., text-only models),
|
||||
# the encoder cache will not be initialized because cache size is 0
|
||||
# for these models.
|
||||
self.encoder_cache_manager = EncoderCacheManager(
|
||||
cache_size=encoder_cache_size)
|
||||
|
||||
def schedule(self) -> SchedulerOutput:
|
||||
# NOTE(woosuk) on the scheduling algorithm:
|
||||
# There's no "decoding phase" nor "prefill phase" in the scheduler.
|
||||
# Each request just has the num_computed_tokens and
|
||||
# num_tokens_with_spec. num_tokens_with_spec =
|
||||
# len(prompt_token_ids) + len(output_token_ids) + len(spec_token_ids).
|
||||
# At each step, the scheduler tries to assign tokens to the requests
|
||||
# so that each request's num_computed_tokens can catch up its
|
||||
# num_tokens_with_spec. This is general enough to cover
|
||||
# chunked prefills, prefix caching, speculative decoding,
|
||||
# and the "jump decoding" optimization in the future.
|
||||
|
||||
scheduled_new_reqs: list[Request] = []
|
||||
scheduled_resumed_reqs: list[Request] = []
|
||||
scheduled_running_reqs: list[Request] = []
|
||||
preempted_reqs: list[Request] = []
|
||||
|
||||
# NOTE: structured_output_request_ids maps
|
||||
# a request's (request that uses structured output)
|
||||
# request_id to the running request index.
|
||||
# This will helps us determine to slice the grammar bitmask
|
||||
# and only applies valid mask for requests that
|
||||
# uses structured decoding.
|
||||
structured_output_request_ids: dict[str, int] = {}
|
||||
|
||||
req_to_new_block_ids: dict[str, list[int]] = {}
|
||||
num_scheduled_tokens: dict[str, int] = {}
|
||||
token_budget = self.max_num_scheduled_tokens
|
||||
# Encoder-related.
|
||||
scheduled_encoder_inputs: dict[str, list[int]] = {}
|
||||
encoder_budget = self.max_num_encoder_input_tokens
|
||||
# Spec decode-related.
|
||||
scheduled_spec_decode_tokens: dict[str, list[int]] = {}
|
||||
|
||||
# For logging.
|
||||
scheduled_timestamp = time.monotonic()
|
||||
|
||||
# First, schedule the RUNNING requests.
|
||||
req_index = 0
|
||||
while req_index < len(self.running) and token_budget > 0:
|
||||
request = self.running[req_index]
|
||||
if request.request_id in self.scheduled_req_ids:
|
||||
# This request has already been scheduled.
|
||||
req_index += 1
|
||||
continue
|
||||
|
||||
num_new_tokens = (request.num_tokens_with_spec -
|
||||
request.num_computed_tokens)
|
||||
if (0 < self.scheduler_config.long_prefill_token_threshold <
|
||||
num_new_tokens):
|
||||
num_new_tokens = (
|
||||
self.scheduler_config.long_prefill_token_threshold)
|
||||
num_new_tokens = min(num_new_tokens, token_budget)
|
||||
assert num_new_tokens > 0
|
||||
|
||||
# Schedule encoder inputs.
|
||||
if request.has_encoder_inputs:
|
||||
(encoder_inputs_to_schedule, num_new_tokens,
|
||||
new_encoder_budget) = self._try_schedule_encoder_inputs(
|
||||
request, request.num_computed_tokens, num_new_tokens,
|
||||
encoder_budget)
|
||||
if num_new_tokens == 0:
|
||||
# The request cannot be scheduled because the encoder budget
|
||||
# or the encoder cache is exhausted.
|
||||
# NOTE(woosuk): By using `continue` instead of `break` here,
|
||||
# we intentionally relax the strict FCFS scheduling policy
|
||||
# to allow lower-priority requests to be scheduled when a
|
||||
# higher-priority request is blocked by encoder constraints.
|
||||
req_index += 1
|
||||
continue
|
||||
else:
|
||||
encoder_inputs_to_schedule = None
|
||||
new_encoder_budget = encoder_budget
|
||||
|
||||
while True:
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request, num_new_tokens)
|
||||
if new_blocks is None:
|
||||
# The request cannot be scheduled.
|
||||
# Preempt the lowest-priority request.
|
||||
preempted_req = self.running.pop()
|
||||
self.kv_cache_manager.free(preempted_req)
|
||||
preempted_req.status = RequestStatus.PREEMPTED
|
||||
preempted_req.num_computed_tokens = 0
|
||||
if self.log_stats:
|
||||
preempted_req.record_event(
|
||||
EngineCoreEventType.PREEMPTED, scheduled_timestamp)
|
||||
|
||||
self.waiting.appendleft(preempted_req)
|
||||
preempted_reqs.append(preempted_req)
|
||||
if preempted_req == request:
|
||||
# No more request to preempt.
|
||||
can_schedule = False
|
||||
break
|
||||
else:
|
||||
# The request can be scheduled.
|
||||
can_schedule = True
|
||||
break
|
||||
if not can_schedule:
|
||||
break
|
||||
assert new_blocks is not None
|
||||
|
||||
# Schedule the request.
|
||||
scheduled_running_reqs.append(request)
|
||||
self.scheduled_req_ids.add(request.request_id)
|
||||
if request.use_structured_output:
|
||||
# PERF: in case of chunked prefill,
|
||||
# request might not include any new tokens.
|
||||
# Therefore, we might introduce some additional
|
||||
# cycle to fill in the bitmask, which could be a big no-op.
|
||||
structured_output_request_ids[request.request_id] = req_index
|
||||
req_to_new_block_ids[request.request_id] = [
|
||||
b.block_id for b in new_blocks
|
||||
]
|
||||
num_scheduled_tokens[request.request_id] = num_new_tokens
|
||||
token_budget -= num_new_tokens
|
||||
req_index += 1
|
||||
|
||||
# Speculative decode related.
|
||||
if request.spec_token_ids:
|
||||
num_scheduled_spec_tokens = (num_new_tokens +
|
||||
request.num_computed_tokens -
|
||||
request.num_tokens)
|
||||
if num_scheduled_spec_tokens > 0:
|
||||
# Trim spec_token_ids list to num_scheduled_spec_tokens.
|
||||
del request.spec_token_ids[num_scheduled_spec_tokens:]
|
||||
scheduled_spec_decode_tokens[request.request_id] = (
|
||||
request.spec_token_ids)
|
||||
|
||||
# Encoder-related.
|
||||
if encoder_inputs_to_schedule:
|
||||
scheduled_encoder_inputs[request.request_id] = (
|
||||
encoder_inputs_to_schedule)
|
||||
# Allocate the encoder cache.
|
||||
for i in encoder_inputs_to_schedule:
|
||||
self.encoder_cache_manager.allocate(request, i)
|
||||
encoder_budget = new_encoder_budget
|
||||
|
||||
# Record the LoRAs in scheduled_running_reqs
|
||||
scheduled_loras: set[int] = set()
|
||||
if self.lora_config:
|
||||
scheduled_loras = set(
|
||||
req.lora_request.lora_int_id for req in scheduled_running_reqs
|
||||
if req.lora_request and req.lora_request.lora_int_id > 0)
|
||||
assert len(scheduled_loras) <= self.lora_config.max_loras
|
||||
|
||||
# Use a temporary deque to collect requests that need to be skipped
|
||||
# and put back at the head of the waiting queue later
|
||||
skipped_waiting_requests: deque[Request] = deque()
|
||||
|
||||
# Next, schedule the WAITING requests.
|
||||
if not preempted_reqs:
|
||||
while self.waiting and token_budget > 0:
|
||||
if len(self.running) == self.max_num_running_reqs:
|
||||
break
|
||||
|
||||
request = self.waiting[0]
|
||||
|
||||
# Skip request if the structured output request is still waiting
|
||||
# for FSM compilation.
|
||||
if request.status == RequestStatus.WAITING_FOR_FSM:
|
||||
structured_output_req = request.structured_output_request
|
||||
if structured_output_req and structured_output_req.grammar:
|
||||
request.status = RequestStatus.WAITING
|
||||
else:
|
||||
self.waiting.popleft()
|
||||
skipped_waiting_requests.appendleft(request)
|
||||
continue
|
||||
|
||||
# Check that adding the request still respects the max_loras
|
||||
# constraint.
|
||||
if self.lora_config and request.lora_request and (
|
||||
len(scheduled_loras) == self.lora_config.max_loras
|
||||
and request.lora_request.lora_int_id
|
||||
not in scheduled_loras):
|
||||
# Scheduling would exceed max_loras, skip.
|
||||
self.waiting.popleft()
|
||||
skipped_waiting_requests.appendleft(request)
|
||||
continue
|
||||
|
||||
# Get already-cached tokens.
|
||||
computed_blocks, num_computed_tokens = \
|
||||
self.kv_cache_manager.get_computed_blocks(request)
|
||||
# Number of tokens to be scheduled.
|
||||
# We use `request.num_tokens` instead of
|
||||
# `request.num_prompt_tokens` to consider the resumed requests,
|
||||
# which have output tokens.
|
||||
num_new_tokens = request.num_tokens - num_computed_tokens
|
||||
if (0 < self.scheduler_config.long_prefill_token_threshold <
|
||||
num_new_tokens):
|
||||
num_new_tokens = (
|
||||
self.scheduler_config.long_prefill_token_threshold)
|
||||
num_new_tokens = min(num_new_tokens, token_budget)
|
||||
assert num_new_tokens > 0
|
||||
|
||||
# Schedule encoder inputs.
|
||||
if request.has_encoder_inputs:
|
||||
(encoder_inputs_to_schedule, num_new_tokens,
|
||||
new_encoder_budget) = self._try_schedule_encoder_inputs(
|
||||
request, num_computed_tokens, num_new_tokens,
|
||||
encoder_budget)
|
||||
if num_new_tokens == 0:
|
||||
# The request cannot be scheduled.
|
||||
break
|
||||
else:
|
||||
encoder_inputs_to_schedule = None
|
||||
new_encoder_budget = encoder_budget
|
||||
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request, num_new_tokens, computed_blocks)
|
||||
if new_blocks is None:
|
||||
# The request cannot be scheduled.
|
||||
break
|
||||
|
||||
self.waiting.popleft()
|
||||
if request.use_structured_output:
|
||||
structured_output_request_ids[
|
||||
request.request_id] = req_index
|
||||
req_index += 1
|
||||
self.running.append(request)
|
||||
self.scheduled_req_ids.add(request.request_id)
|
||||
if self.log_stats:
|
||||
request.record_event(EngineCoreEventType.SCHEDULED,
|
||||
scheduled_timestamp)
|
||||
if request.status == RequestStatus.WAITING:
|
||||
scheduled_new_reqs.append(request)
|
||||
elif request.status == RequestStatus.PREEMPTED:
|
||||
scheduled_resumed_reqs.append(request)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Invalid request status: {request.status}")
|
||||
|
||||
if self.lora_config and request.lora_request:
|
||||
scheduled_loras.add(request.lora_request.lora_int_id)
|
||||
req_to_new_block_ids[request.request_id] = [
|
||||
b.block_id for b in computed_blocks + new_blocks
|
||||
]
|
||||
num_scheduled_tokens[request.request_id] = num_new_tokens
|
||||
token_budget -= num_new_tokens
|
||||
request.status = RequestStatus.RUNNING
|
||||
request.num_computed_tokens = num_computed_tokens
|
||||
|
||||
# Encoder-related.
|
||||
if encoder_inputs_to_schedule:
|
||||
scheduled_encoder_inputs[request.request_id] = (
|
||||
encoder_inputs_to_schedule)
|
||||
# Allocate the encoder cache.
|
||||
for i in encoder_inputs_to_schedule:
|
||||
self.encoder_cache_manager.allocate(request, i)
|
||||
encoder_budget = new_encoder_budget
|
||||
|
||||
# Put back any skipped requests at the head of the waiting queue
|
||||
if skipped_waiting_requests:
|
||||
self.waiting.extendleft(skipped_waiting_requests)
|
||||
|
||||
# Check if the scheduling constraints are satisfied.
|
||||
total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
|
||||
assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
|
||||
assert token_budget >= 0
|
||||
assert len(self.running) <= self.max_num_running_reqs
|
||||
# Since some requests in the RUNNING queue may not be scheduled in
|
||||
# this step, the total number of scheduled requests can be smaller than
|
||||
# len(self.running).
|
||||
assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) +
|
||||
len(scheduled_running_reqs) <= len(self.running))
|
||||
|
||||
# Get the longest common prefix among all requests in the running queue.
|
||||
# This can be potentially used for cascade attention.
|
||||
num_common_prefix_blocks = 0
|
||||
if self.running:
|
||||
any_request = self.running[0]
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
|
||||
grammar_bitmask = self.structured_output_manager.grammar_bitmask(
|
||||
self.requests,
|
||||
structured_output_request_ids,
|
||||
len(self.running),
|
||||
)
|
||||
# Construct the scheduler output.
|
||||
new_reqs_data = [
|
||||
NewRequestData.from_request(req,
|
||||
req_to_new_block_ids[req.request_id])
|
||||
for req in scheduled_new_reqs
|
||||
]
|
||||
resumed_reqs_data = [
|
||||
self._make_cached_request_data(
|
||||
req,
|
||||
num_scheduled_tokens[req.request_id],
|
||||
len(scheduled_spec_decode_tokens.get(req.request_id, ())),
|
||||
req_to_new_block_ids[req.request_id],
|
||||
resumed_from_preemption=True,
|
||||
) for req in scheduled_resumed_reqs
|
||||
]
|
||||
running_reqs_data = [
|
||||
self._make_cached_request_data(
|
||||
req,
|
||||
num_scheduled_tokens[req.request_id],
|
||||
len(scheduled_spec_decode_tokens.get(req.request_id, ())),
|
||||
req_to_new_block_ids[req.request_id],
|
||||
resumed_from_preemption=False,
|
||||
) for req in scheduled_running_reqs
|
||||
]
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=new_reqs_data,
|
||||
scheduled_cached_reqs=resumed_reqs_data + running_reqs_data,
|
||||
num_scheduled_tokens=num_scheduled_tokens,
|
||||
total_num_scheduled_tokens=total_num_scheduled_tokens,
|
||||
scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
|
||||
scheduled_encoder_inputs=scheduled_encoder_inputs,
|
||||
num_common_prefix_blocks=num_common_prefix_blocks,
|
||||
# finished_req_ids is an existing state in the scheduler,
|
||||
# instead of being newly scheduled in this step.
|
||||
# It contains the request IDs that are finished in between
|
||||
# the previous and the current steps.
|
||||
finished_req_ids=self.finished_req_ids,
|
||||
free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
|
||||
structured_output_request_ids=structured_output_request_ids,
|
||||
grammar_bitmask=grammar_bitmask,
|
||||
)
|
||||
|
||||
# Advance the number of computed tokens for the request AFTER
|
||||
# the request is scheduled.
|
||||
# 1. The scheduler_output of the current step has to include the
|
||||
# original number of scheduled tokens to determine input IDs.
|
||||
# 2. Advance the number of computed tokens here allowing us to
|
||||
# schedule the prefill request again immediately in the next
|
||||
# scheduling step.
|
||||
# 3. If some tokens (e.g. spec tokens) are rejected later, the number of
|
||||
# computed tokens will be adjusted in update_from_output.
|
||||
for req_id, num_scheduled_token in num_scheduled_tokens.items():
|
||||
self.requests[req_id].num_computed_tokens += num_scheduled_token
|
||||
|
||||
self.finished_req_ids = set()
|
||||
return scheduler_output
|
||||
|
||||
def _make_cached_request_data(
|
||||
self,
|
||||
request: Request,
|
||||
num_scheduled_tokens: int,
|
||||
num_scheduled_spec_tokens: int,
|
||||
new_block_ids: list[int],
|
||||
resumed_from_preemption: bool,
|
||||
) -> CachedRequestData:
|
||||
# OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
|
||||
# them at each scheduling step.
|
||||
num_computed_tokens = request.num_computed_tokens
|
||||
num_regular_tokens = num_scheduled_tokens - num_scheduled_spec_tokens
|
||||
new_token_ids = request.all_token_ids[
|
||||
num_computed_tokens:num_computed_tokens + num_regular_tokens]
|
||||
req_data = self._cached_reqs_data.get(request.request_id)
|
||||
if req_data is not None:
|
||||
req_data.resumed_from_preemption = resumed_from_preemption
|
||||
req_data.new_token_ids = new_token_ids
|
||||
req_data.new_block_ids = new_block_ids
|
||||
req_data.num_computed_tokens = num_computed_tokens
|
||||
else:
|
||||
req_data = CachedRequestData.from_request(request,
|
||||
resumed_from_preemption,
|
||||
new_token_ids,
|
||||
new_block_ids)
|
||||
self._cached_reqs_data[request.request_id] = req_data
|
||||
return req_data
|
||||
|
||||
def _try_schedule_encoder_inputs(
|
||||
self,
|
||||
request: Request,
|
||||
num_computed_tokens: int,
|
||||
num_new_tokens: int,
|
||||
encoder_budget: int,
|
||||
) -> tuple[list[int], int, int]:
|
||||
"""
|
||||
Determine which encoder inputs need to be scheduled in the current step,
|
||||
and update `num_new_tokens` and encoder token budget accordingly.
|
||||
|
||||
An encoder input will be scheduled if:
|
||||
- Its output tokens overlap with the range of tokens being computed
|
||||
in this step, i.e.,
|
||||
[num_computed_tokens, num_computed_tokens + num_new_tokens).
|
||||
- It is not already computed and stored in the encoder cache.
|
||||
- There is sufficient encoder token budget to process it.
|
||||
- The encoder cache has space to store it.
|
||||
|
||||
If an encoder input cannot be scheduled due to cache or budget
|
||||
limitations, the method adjusts `num_new_tokens` to schedule only the
|
||||
decoder tokens up to just before the unschedulable encoder input.
|
||||
"""
|
||||
encoder_inputs_to_schedule: list[int] = []
|
||||
mm_positions = request.mm_positions
|
||||
assert mm_positions is not None
|
||||
assert len(mm_positions) > 0
|
||||
for i, pos_info in enumerate(mm_positions):
|
||||
start_pos = pos_info["offset"]
|
||||
num_encoder_tokens = pos_info["length"]
|
||||
|
||||
# The encoder output is needed if the two ranges overlap:
|
||||
# [num_computed_tokens, num_computed_tokens + num_new_tokens) and
|
||||
# [start_pos, start_pos + num_encoder_tokens)
|
||||
if start_pos >= num_computed_tokens + num_new_tokens:
|
||||
# The encoder input is not needed in this step.
|
||||
break
|
||||
if start_pos + num_encoder_tokens <= num_computed_tokens:
|
||||
# The encoder input is already computed and stored
|
||||
# in the decoder's KV cache.
|
||||
continue
|
||||
|
||||
if self.encoder_cache_manager.has_cache(request, i):
|
||||
# The encoder input is already computed and cached.
|
||||
continue
|
||||
if (not self.encoder_cache_manager.can_allocate(request, i)
|
||||
or num_encoder_tokens > encoder_budget):
|
||||
# The encoder cache is full or the encoder budget is exhausted.
|
||||
# NOTE(woosuk): We assume that the encoder input tokens should
|
||||
# be processed altogether, as the encoder usually uses
|
||||
# bidirectional attention.
|
||||
if num_computed_tokens < start_pos:
|
||||
# We only schedule the decoder tokens just before the
|
||||
# encoder input.
|
||||
num_new_tokens = start_pos - num_computed_tokens
|
||||
else:
|
||||
# Because of prefix caching, num_computed_tokens is greater
|
||||
# than start_pos even though its encoder input is not
|
||||
# available. In this case, we can't schedule any token for
|
||||
# the request in this step.
|
||||
num_new_tokens = 0
|
||||
break
|
||||
|
||||
encoder_budget -= num_encoder_tokens
|
||||
encoder_inputs_to_schedule.append(i)
|
||||
return encoder_inputs_to_schedule, num_new_tokens, encoder_budget
|
||||
|
||||
def update_from_output(
|
||||
self,
|
||||
scheduler_output: SchedulerOutput,
|
||||
model_runner_output: ModelRunnerOutput,
|
||||
) -> EngineCoreOutputs:
|
||||
sampled_token_ids = model_runner_output.sampled_token_ids
|
||||
spec_token_ids = model_runner_output.spec_token_ids
|
||||
logprobs = model_runner_output.logprobs
|
||||
prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict
|
||||
num_scheduled_tokens = scheduler_output.num_scheduled_tokens
|
||||
|
||||
new_running: list[Request] = []
|
||||
outputs: list[EngineCoreOutput] = []
|
||||
spec_decoding_stats: Optional[SpecDecodingStats] = None
|
||||
|
||||
# NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
|
||||
# loop can be a performance bottleneck. We should do our best to avoid
|
||||
# expensive operations inside the loop.
|
||||
for request in self.running:
|
||||
req_id = request.request_id
|
||||
num_tokens_scheduled = num_scheduled_tokens.get(req_id, 0)
|
||||
if num_tokens_scheduled == 0:
|
||||
# The request was not scheduled in this step.
|
||||
new_running.append(request)
|
||||
continue
|
||||
|
||||
req_index = model_runner_output.req_id_to_index[req_id]
|
||||
generated_token_ids = sampled_token_ids[req_index]
|
||||
|
||||
scheduled_spec_token_ids = (
|
||||
scheduler_output.scheduled_spec_decode_tokens.get(req_id))
|
||||
if scheduled_spec_token_ids:
|
||||
# num_computed_tokens represents the number of tokens
|
||||
# processed in the current step, considering scheduled
|
||||
# tokens and rejections. If some tokens are rejected,
|
||||
# num_computed_tokens is decreased by the number of rejected
|
||||
# tokens, where is given by:
|
||||
# len(scheduled_spec_token_ids) + 1 - len(generated_token_ids).
|
||||
num_tokens_rejected = (len(scheduled_spec_token_ids) + 1 -
|
||||
len(generated_token_ids))
|
||||
request.num_computed_tokens -= num_tokens_rejected
|
||||
spec_decoding_stats = self.make_spec_decoding_stats(
|
||||
spec_decoding_stats,
|
||||
num_draft_tokens=len(scheduled_spec_token_ids),
|
||||
num_accepted_tokens=len(generated_token_ids) - 1)
|
||||
|
||||
cached_encoder_input_ids = (
|
||||
self.encoder_cache_manager.get_cached_input_ids(request))
|
||||
# OPTIMIZATION: Avoid list(set) if the set is empty.
|
||||
if cached_encoder_input_ids:
|
||||
for input_id in list(cached_encoder_input_ids):
|
||||
mm_positions = request.mm_positions[input_id]
|
||||
start_pos = mm_positions["offset"]
|
||||
num_tokens = mm_positions["length"]
|
||||
if start_pos + num_tokens <= request.num_computed_tokens:
|
||||
# The encoder output is already processed and stored
|
||||
# in the decoder's KV cache.
|
||||
self.encoder_cache_manager.free_encoder_input(
|
||||
request, input_id)
|
||||
|
||||
# Add newly generated spec token ids to the request.
|
||||
if spec_token_ids is not None:
|
||||
request.spec_token_ids = spec_token_ids[req_index]
|
||||
|
||||
stopped = False
|
||||
new_logprobs = None
|
||||
new_token_ids = generated_token_ids
|
||||
|
||||
# Append generated tokens and check for stop. Note that if
|
||||
# a request is still being prefilled, we expect the model runner
|
||||
# to return empty token ids for the request.
|
||||
for num_new, output_token_id in enumerate(new_token_ids, 1):
|
||||
request.append_output_token_ids(output_token_id)
|
||||
|
||||
# Check for stop and update request state.
|
||||
# This must be called before we make the EngineCoreOutput.
|
||||
stopped = check_stop(request, self.max_model_len)
|
||||
if stopped:
|
||||
self._free_request(request)
|
||||
del new_token_ids[num_new:] # Trim new tokens if needed.
|
||||
break
|
||||
|
||||
# Extract sample logprobs if needed.
|
||||
if request.sampling_params.logprobs is not None and logprobs:
|
||||
# NOTE: once we support N tokens per step (spec decode),
|
||||
# the outer lists can be of length > 1.
|
||||
new_logprobs = logprobs.slice(req_index, req_index + 1)
|
||||
|
||||
if new_token_ids and request.use_structured_output:
|
||||
# NOTE: structured_output_request
|
||||
# should not be None if use_structured_output, we have
|
||||
# check above, so safe to ignore type warning
|
||||
request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr]
|
||||
req_id, new_token_ids)
|
||||
|
||||
# Get prompt logprobs for this request.
|
||||
prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
|
||||
if new_token_ids:
|
||||
# Add EngineCoreOutput for this Request.
|
||||
outputs.append(
|
||||
EngineCoreOutput(
|
||||
request_id=req_id,
|
||||
new_token_ids=new_token_ids,
|
||||
finish_reason=request.get_finished_reason(),
|
||||
new_logprobs=new_logprobs,
|
||||
new_prompt_logprobs_tensors=prompt_logprobs_tensors,
|
||||
stop_reason=request.stop_reason,
|
||||
events=request.take_events()))
|
||||
else:
|
||||
# Invariant: EngineCore returns no partial prefill outputs.
|
||||
assert not prompt_logprobs_tensors
|
||||
|
||||
self.scheduled_req_ids.remove(req_id)
|
||||
if not stopped:
|
||||
new_running.append(request)
|
||||
|
||||
self.running = new_running
|
||||
engine_core_outputs = EngineCoreOutputs(
|
||||
outputs=outputs,
|
||||
scheduler_stats=self.make_stats(spec_decoding_stats),
|
||||
)
|
||||
if self.include_finished_set:
|
||||
#TODO currently sending duplicates here, improve this
|
||||
engine_core_outputs.finished_requests = (
|
||||
scheduler_output.finished_req_ids | self.finished_req_ids)
|
||||
|
||||
return engine_core_outputs
|
||||
|
||||
def add_request(self, request: Request) -> None:
|
||||
self.waiting.append(request)
|
||||
self.requests[request.request_id] = request
|
||||
if self.log_stats:
|
||||
request.record_event(EngineCoreEventType.QUEUED)
|
||||
|
||||
def finish_requests(
|
||||
self,
|
||||
request_ids: Union[str, Iterable[str]],
|
||||
finished_status: RequestStatus,
|
||||
) -> None:
|
||||
"""Handles the finish signal from outside the scheduler.
|
||||
|
||||
For example, the API server can abort a request when the client
|
||||
disconnects.
|
||||
"""
|
||||
assert RequestStatus.is_finished(finished_status)
|
||||
if isinstance(request_ids, str):
|
||||
request_ids = (request_ids, )
|
||||
else:
|
||||
request_ids = set(request_ids)
|
||||
|
||||
for req_id in request_ids:
|
||||
request = self.requests.get(req_id)
|
||||
if request is None:
|
||||
# Invalid request ID.
|
||||
continue
|
||||
|
||||
if request.status == RequestStatus.RUNNING:
|
||||
self.running.remove(request)
|
||||
self.scheduled_req_ids.discard(request.request_id)
|
||||
else:
|
||||
self.waiting.remove(request)
|
||||
request.status = finished_status
|
||||
self._free_request(request)
|
||||
|
||||
def _free_request(self, request: Request) -> None:
|
||||
assert request.is_finished()
|
||||
self.kv_cache_manager.free(request)
|
||||
self.kv_cache_manager.free_block_hashes(request)
|
||||
self.encoder_cache_manager.free(request)
|
||||
self._cached_reqs_data.pop(request.request_id, None)
|
||||
del self.requests[request.request_id]
|
||||
self.finished_req_ids.add(request.request_id)
|
||||
|
||||
def get_num_unfinished_requests(self) -> int:
|
||||
return len(self.waiting) + len(self.running)
|
||||
|
||||
def has_finished_requests(self) -> bool:
|
||||
return len(self.finished_req_ids) > 0
|
||||
|
||||
def get_num_unscheduled_requests(self) -> int:
|
||||
"""Number of requests that are not being processed by the executor."""
|
||||
return self.get_num_unfinished_requests() - len(self.scheduled_req_ids)
|
||||
|
||||
def reset_prefix_cache(self) -> bool:
|
||||
return self.kv_cache_manager.reset_prefix_cache()
|
||||
|
||||
def make_stats(
|
||||
self,
|
||||
spec_decoding_stats: Optional[SpecDecodingStats] = None,
|
||||
) -> Optional[SchedulerStats]:
|
||||
if not self.log_stats:
|
||||
return None
|
||||
return SchedulerStats(
|
||||
num_running_reqs=len(self.running),
|
||||
num_waiting_reqs=len(self.waiting),
|
||||
gpu_cache_usage=self.kv_cache_manager.usage,
|
||||
prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
|
||||
spec_decoding_stats=spec_decoding_stats,
|
||||
)
|
||||
|
||||
def make_spec_decoding_stats(
|
||||
self,
|
||||
spec_decoding_stats: Optional[SpecDecodingStats],
|
||||
num_draft_tokens: int,
|
||||
num_accepted_tokens: int,
|
||||
) -> Optional[SpecDecodingStats]:
|
||||
if not self.log_stats:
|
||||
return None
|
||||
if spec_decoding_stats is None:
|
||||
spec_decoding_stats = SpecDecodingStats()
|
||||
spec_decoding_stats.observe(num_draft_tokens=num_draft_tokens,
|
||||
num_accepted_tokens=num_accepted_tokens)
|
||||
return spec_decoding_stats
|
||||
22
vllm/v1/core/sched/utils.py
Normal file
22
vllm/v1/core/sched/utils.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
|
||||
|
||||
def check_stop(request: Request, max_model_len: int) -> bool:
|
||||
if (request.num_tokens >= max_model_len
|
||||
or request.num_output_tokens >= request.max_tokens):
|
||||
request.status = RequestStatus.FINISHED_LENGTH_CAPPED
|
||||
return True
|
||||
|
||||
sampling_params = request.sampling_params
|
||||
last_token_id = request.output_token_ids[-1]
|
||||
if (not sampling_params.ignore_eos
|
||||
and last_token_id == request.eos_token_id):
|
||||
request.status = RequestStatus.FINISHED_STOPPED
|
||||
return True
|
||||
|
||||
if last_token_id in (sampling_params.stop_token_ids or ()):
|
||||
request.status = RequestStatus.FINISHED_STOPPED
|
||||
request.stop_reason = last_token_id
|
||||
return True
|
||||
return False
|
||||
161
vllm/v1/core/specialized_manager.py
Normal file
161
vllm/v1/core/specialized_manager.py
Normal file
@@ -0,0 +1,161 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from vllm.utils import cdiv
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec,
|
||||
SlidingWindowSpec)
|
||||
|
||||
|
||||
class SpecializedManager(ABC):
|
||||
"""
|
||||
An abstract base class for specialized managers that handle the kv
|
||||
cache management logic of different attention layers.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
kv_cache_spec: KVCacheSpec,
|
||||
block_pool: BlockPool,
|
||||
) -> None:
|
||||
"""
|
||||
Initializes the SpecializedManager.
|
||||
Args:
|
||||
kv_cache_spec: The kv_cache_spec for this manager.
|
||||
block_pool: The block pool.
|
||||
"""
|
||||
|
||||
self.block_size = kv_cache_spec.block_size
|
||||
self.kv_cache_spec = kv_cache_spec
|
||||
self.block_pool = block_pool
|
||||
|
||||
@abstractmethod
|
||||
def find_longest_cache_hit(
|
||||
self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
|
||||
"""
|
||||
Get the longest cache hit prefix of the blocks. If no cache hit is
|
||||
found, return an empty list.
|
||||
|
||||
Args:
|
||||
block_hashes: The block hashes of the request.
|
||||
Returns:
|
||||
A list of cached blocks with skipped blocks replaced by null block.
|
||||
For example, sliding window manager should return a list like
|
||||
[NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)] for block size 4 and
|
||||
sliding window 8.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def remove_skipped_blocks(self, blocks: list[KVCacheBlock],
|
||||
num_computed_tokens: int) -> list[KVCacheBlock]:
|
||||
"""
|
||||
Remove the blocks that are no longer needed from `blocks`. The removed
|
||||
blocks should be replaced by null_block. Return the removed blocks in
|
||||
eviction order, where the first returned block should be evicted first.
|
||||
Don't free the removed blocks in this function.
|
||||
|
||||
Args:
|
||||
blocks: The list of blocks to be updated.
|
||||
num_computed_tokens: The number of tokens that have been computed.
|
||||
Returns:
|
||||
The removed blocks in eviction order.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class FullAttentionManager(SpecializedManager):
|
||||
|
||||
def find_longest_cache_hit(
|
||||
self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
|
||||
computed_blocks: list[KVCacheBlock] = []
|
||||
for block_hash in block_hashes:
|
||||
# block_hashes is a chain of block hashes. If a block hash is not
|
||||
# in the cached_block_hash_to_id, the following block hashes are
|
||||
# not computed yet for sure.
|
||||
if cached_block := self.block_pool.get_cached_block(block_hash):
|
||||
computed_blocks.append(cached_block)
|
||||
else:
|
||||
break
|
||||
return computed_blocks
|
||||
|
||||
def remove_skipped_blocks(self, blocks: list[KVCacheBlock],
|
||||
num_computed_tokens: int) -> list[KVCacheBlock]:
|
||||
# No need to remove blocks for full attention.
|
||||
return []
|
||||
|
||||
|
||||
class SlidingWindowManager(SpecializedManager):
|
||||
|
||||
def __init__(self, kv_cache_spec: SlidingWindowSpec,
|
||||
block_pool: BlockPool):
|
||||
super().__init__(kv_cache_spec, block_pool)
|
||||
self.sliding_window = kv_cache_spec.sliding_window
|
||||
# The number of contiguous blocks needed for prefix cache hit.
|
||||
# -1 since the input token itself is also included in the window
|
||||
self.sliding_window_contiguous_blocks = cdiv(
|
||||
(kv_cache_spec.sliding_window - 1), self.block_size)
|
||||
self._null_block = block_pool.null_block
|
||||
|
||||
def find_longest_cache_hit(
|
||||
self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
|
||||
# TODO: reduce i by sliding_window_contiguous_blocks when cache miss, to
|
||||
# optimize the time complexity from O(len(block_hashes)) to
|
||||
# O(len(block_hashes) / sliding_window_contiguous_blocks +
|
||||
# sliding_window_contiguous_blocks),
|
||||
# which is good for low cache hit rate scenarios.
|
||||
computed_blocks = [self._null_block] * len(block_hashes)
|
||||
num_contiguous_blocks = 0
|
||||
|
||||
# Search from right to left and early stop when a match is found.
|
||||
for i in range(len(block_hashes) - 1, -1, -1):
|
||||
if cached_block := self.block_pool.get_cached_block(
|
||||
block_hashes[i]):
|
||||
computed_blocks[i] = cached_block
|
||||
num_contiguous_blocks += 1
|
||||
if (num_contiguous_blocks
|
||||
>= self.sliding_window_contiguous_blocks):
|
||||
# Trim the trailing blocks.
|
||||
# E.g., [NULL, NULL, 8, 3, NULL, 9] -> [NULL, NULL, 8, 3]
|
||||
# when sliding_window_contiguous_blocks=2.
|
||||
del computed_blocks[i + num_contiguous_blocks:]
|
||||
return computed_blocks
|
||||
else:
|
||||
num_contiguous_blocks = 0
|
||||
# The first `num_contiguous_blocks` is a cache hit even if
|
||||
# `num_contiguous_blocks < sliding_window_contiguous_blocks`.
|
||||
del computed_blocks[num_contiguous_blocks:]
|
||||
return computed_blocks
|
||||
|
||||
def remove_skipped_blocks(self, blocks: list[KVCacheBlock],
|
||||
num_computed_tokens: int) -> list[KVCacheBlock]:
|
||||
# Remove the blocks that are no longer be in the sliding window and
|
||||
# skipped during the attention computation.
|
||||
last_useful_token = num_computed_tokens - self.sliding_window + 1
|
||||
last_useful_block = last_useful_token // self.block_size
|
||||
|
||||
removed_blocks: list[KVCacheBlock] = []
|
||||
for i in range(last_useful_block - 1, -1, -1):
|
||||
if blocks[i] == self._null_block:
|
||||
# If the block is already a null block, the blocks before it
|
||||
# should also have been set to null blocks by the previous calls
|
||||
# to this function.
|
||||
break
|
||||
removed_blocks.append(blocks[i])
|
||||
blocks[i] = self._null_block
|
||||
return removed_blocks
|
||||
|
||||
|
||||
spec_manager_map: dict[type[KVCacheSpec], type[SpecializedManager]] = {
|
||||
FullAttentionSpec: FullAttentionManager,
|
||||
SlidingWindowSpec: SlidingWindowManager,
|
||||
}
|
||||
|
||||
|
||||
def get_specialized_manager(kv_cache_spec: KVCacheSpec,
|
||||
block_pool: BlockPool) -> SpecializedManager:
|
||||
manager_class = spec_manager_map[type(kv_cache_spec)]
|
||||
manager = manager_class(kv_cache_spec, block_pool)
|
||||
return manager
|
||||
Reference in New Issue
Block a user