forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3

vllm-v0.6.2/vllm/v1/core/__init__.py (new file, 0 lines)

vllm-v0.6.2/vllm/v1/core/encoder_cache_manager.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from typing import Dict, List, Set, Tuple

from vllm.v1.request import Request


class EncoderCacheManager:

    def __init__(self, cache_size: int):
        self.cache_size = cache_size
        self.num_free_slots = cache_size
        # req_id -> cached input ids
        self.cached: Dict[str, Set[int]] = {}
        # List of (req_id, input_id) tuples
        self.freed: List[Tuple[str, int]] = []

    def has_cache(self, request: Request, input_id: int) -> bool:
        req_id = request.request_id
        return req_id in self.cached and input_id in self.cached[req_id]

    def can_allocate(self, request: Request, input_id: int) -> bool:
        num_tokens = request.get_num_encoder_tokens(input_id)
        return num_tokens <= self.num_free_slots

    def allocate(self, request: Request, input_id: int) -> None:
        req_id = request.request_id
        if req_id not in self.cached:
            self.cached[req_id] = set()
        self.cached[req_id].add(input_id)
        self.num_free_slots -= request.get_num_encoder_tokens(input_id)

    def get_cached_input_ids(self, request: Request) -> Set[int]:
        return self.cached.get(request.request_id, set())

    def free(self, request: Request, input_id: int) -> None:
        req_id = request.request_id
        if req_id not in self.cached:
            return

        self.cached[req_id].discard(input_id)
        if len(self.cached[req_id]) == 0:
            del self.cached[req_id]
        self.num_free_slots += request.get_num_encoder_tokens(input_id)
        self.freed.append((req_id, input_id))

    def get_freed_ids(self) -> List[Tuple[str, int]]:
        freed = self.freed
        self.freed = []
        return freed
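
The class above is pure slot-count bookkeeping: it never touches encoder outputs, it only tracks how many of the cache_size token slots each multimodal input occupies. A minimal usage sketch, assuming a hypothetical stand-in request (real vllm.v1.request.Request objects carry much more state):

# Sketch only: _StubRequest is a hypothetical stand-in for
# vllm.v1.request.Request; only the attributes used by the manager exist.
class _StubRequest:
    def __init__(self, request_id, encoder_token_counts):
        self.request_id = request_id
        # input_id -> number of encoder tokens, e.g. one 576-token image.
        self._counts = encoder_token_counts

    def get_num_encoder_tokens(self, input_id):
        return self._counts[input_id]

manager = EncoderCacheManager(cache_size=2048)
req = _StubRequest("req-0", {0: 576})
assert manager.can_allocate(req, 0)    # 576 <= 2048 free slots
manager.allocate(req, 0)               # num_free_slots drops to 1472
assert manager.has_cache(req, 0)
manager.free(req, 0)                   # slots returned; (req_id, input_id) queued
assert manager.get_freed_ids() == [("req-0", 0)]  # freed list flushed on read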

vllm-v0.6.2/vllm/v1/core/kv_cache_manager.py (new file, 397 lines)
@@ -0,0 +1,397 @@
from collections import defaultdict
from typing import Dict, List, Optional

from vllm.logger import init_logger
from vllm.utils import cdiv
from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
                                         KVCacheBlock, hash_block_tokens,
                                         hash_request_tokens)
from vllm.v1.request import Request

logger = init_logger(__name__)


class KVCacheManager:

    def __init__(
        self,
        block_size: int,
        num_gpu_blocks: int,
        sliding_window: Optional[int] = None,
        enable_caching: bool = True,
        num_preallocate_tokens: int = 64,
    ) -> None:
        self.block_size = block_size
        self.num_gpu_blocks = num_gpu_blocks
        self.sliding_window = sliding_window
        self.enable_caching = enable_caching
        # NOTE(woosuk): To avoid frequent block allocation, we preallocate some
        # blocks for each request. For example, when a request reaches the end
        # of its block table, we preallocate N blocks in advance. This way, we
        # reduce the overhead of updating free_block_ids and ref_cnts for each
        # request every step (at the cost of some memory waste).
        # NOTE(woosuk): This is different from the "lookahead" slots since this
        # does not guarantee that the request always has N empty blocks. After
        # the request gets N empty blocks, it starts to use the blocks without
        # further allocation. When it uses up all the N empty blocks, it gets
        # N new empty blocks.
        self.num_preallocate_tokens = num_preallocate_tokens
        self.num_preallocate_blocks = cdiv(num_preallocate_tokens, block_size)

        # A block pool of all kv-cache blocks.
        self.block_pool: List[KVCacheBlock] = [
            KVCacheBlock(idx) for idx in range(num_gpu_blocks)
        ]
        # Free block queue that constructs and manipulates a doubly linked
        # list of free blocks (including eviction candidates when caching is
        # enabled).
        self.free_block_queue = FreeKVCacheBlockQueue(self.block_pool)

        # {block_hash: {block ID: block}}. A cached block is
        # a full block with a block hash that can be used for prefix caching.
        # The cached block may be used by running requests or in the
        # free_block_queue that could potentially be evicted.
        # NOTE: We currently don't de-duplicate the blocks in the cache,
        # meaning that if a block becomes full and is cached, we don't check
        # if there is already an identical block in the cache. This is because
        # we want to make sure the allocated block IDs won't change so that
        # block tables are append-only.
        self.cached_block_hash_to_block: Dict[BlockHashType, Dict[
            int, KVCacheBlock]] = defaultdict(dict)

        # Mapping from request ID to blocks to track the blocks allocated
        # for each request, so that we can free the blocks when the request
        # is finished.
        self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}

    def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
        """Get the computed (cached) blocks for the request.
        Note that the computed blocks must be full.

        Args:
            request: The request to get the computed blocks.

        Returns:
            A list of blocks that are computed for the request.
        """
        if not self.enable_caching:
            # Prefix caching is disabled.
            return []

        computed_blocks = []
        block_hashes = hash_request_tokens(self.block_size,
                                           request.all_token_ids)

        for block_hash in block_hashes:
            # block_hashes is a chain of block hashes. If a block hash is not
            # in the cached_block_hash_to_id, the following block hashes are
            # not computed yet for sure.
            if cached_block := self._get_cached_block(block_hash):
                computed_blocks.append(cached_block)
            else:
                break

        return computed_blocks

    def append_slots(
        self,
        request: Request,
        num_tokens: int,
    ) -> Optional[List[KVCacheBlock]]:
        """Append slots to the block table of the request.
        We first append slots to already allocated blocks. If the allocated
        blocks are not enough, we allocate new blocks.

        Args:
            request: The request to append slots.
            num_tokens: The number of tokens to append.

        Returns:
            A list of new blocks if new blocks are allocated, or None
            if new blocks are required but cannot be allocated.
        """
        num_required_blocks = cdiv(request.num_computed_tokens + num_tokens,
                                   self.block_size)
        req_blocks = self.req_to_blocks[request.request_id]

        num_new_blocks = num_required_blocks - len(req_blocks)
        if num_new_blocks > self.free_block_queue.num_free_blocks:
            # Need to allocate new blocks due to insufficient pre-allocated
            # slots, but we cannot allocate new blocks due to the limit.
            return None

        # When caching is enabled, assign token IDs to already allocated
        # blocks.
        new_token_ids = None
        parent_block = None
        if self.enable_caching:
            # Figure out the token IDs to add to the blocks.
            new_token_ids = request.all_token_ids[
                request.num_computed_tokens:request.num_computed_tokens +
                num_tokens]

            # Find the last full block index.
            # TODO: This may be optimized by calculating the computed tokens.
            last_full_block_idx = len(req_blocks) - 1
            while (last_full_block_idx >= 0
                   and req_blocks[last_full_block_idx].block_hash is None):
                last_full_block_idx -= 1

            parent_block = (req_blocks[last_full_block_idx]
                            if last_full_block_idx >= 0 else None)
            token_id_idx = self._add_token_ids_to_blocks(
                blocks=req_blocks[last_full_block_idx + 1:],
                token_ids=new_token_ids,
                parent_block=parent_block)

            new_token_ids = new_token_ids[token_id_idx:]
            parent_block = req_blocks[-1]

        # No new block is needed. When caching is enabled, we make sure
        # token_id_idx is equal to len(new_token_ids), meaning that all tokens
        # are added to allocated blocks.
        if num_required_blocks <= len(req_blocks):
            assert not self.enable_caching or token_id_idx == num_tokens, \
                f"{token_id_idx=} != {num_tokens=}"
            return []

        # Allocate new blocks considering preallocated blocks, and
        # add token IDs to them if caching is enabled.
        num_new_blocks = min(num_new_blocks + self.num_preallocate_blocks,
                             self.free_block_queue.num_free_blocks)
        new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids,
                                          parent_block)
        req_blocks.extend(new_blocks)
        return new_blocks

    def allocate_slots(
        self,
        request: Request,
        num_tokens: int,
        computed_blocks: List[KVCacheBlock],
    ) -> Optional[List[KVCacheBlock]]:
        """Allocate slots for a new request.

        Args:
            request: The request to allocate slots.
            num_tokens: The number of tokens to allocate. Note that this does
                not include the tokens that have already been computed.
            computed_blocks: The blocks that have already been computed.

        Returns:
            A list of new allocated blocks.
        """
        if num_tokens == 0:
            raise ValueError(
                f"num_tokens must be greater than 0, got {num_tokens}")

        # If a computed block of a request is an eviction candidate (in the
        # free queue and ref_cnt == 0), it cannot be counted as a free block
        # when allocating this request.
        num_evictable_computed_blocks = len(
            [blk for blk in computed_blocks if blk.ref_cnt == 0])

        num_required_blocks = cdiv(num_tokens, self.block_size)
        if (num_required_blocks > self.free_block_queue.num_free_blocks -
                num_evictable_computed_blocks):
            # Cannot allocate new blocks.
            return None

        # Determine the number of new blocks to allocate considering
        # preallocated blocks.
        num_new_blocks = min(
            num_required_blocks + self.num_preallocate_blocks,
            self.free_block_queue.num_free_blocks -
            num_evictable_computed_blocks)

        num_computed_tokens = len(computed_blocks) * self.block_size

        # When caching is enabled, get the new token IDs and the parent block
        # ID to generate cache keys.
        new_token_ids = None
        parent_block = None
        if self.enable_caching:
            # Touch the computed blocks to make sure they won't be evicted.
            self._touch(computed_blocks)

            # Get the token IDs for the blocks being allocated for hashing.
            new_token_ids = request.all_token_ids[
                num_computed_tokens:num_computed_tokens + num_tokens]
            if not new_token_ids:
                raise RuntimeError(
                    "Failed to infer the token IDs for allocation. "
                    f"#all_tokens={len(request.all_token_ids)} < "
                    f"#computed_tokens={num_computed_tokens}")

            # Get the parent block ID to construct the block chain.
            parent_block = computed_blocks[-1] if computed_blocks else None

        new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids,
                                          parent_block)

        # Concatenate the computed block IDs and the new block IDs.
        self.req_to_blocks[request.request_id] = computed_blocks + new_blocks
        return new_blocks

    def free(self, request: Request) -> None:
        """Free the blocks allocated for the request.
        When caching is enabled, we free the blocks in reverse order so that
        the tail blocks are evicted first.

        Args:
            request: The request to free the blocks.
        """
        # Default to [] in case a request is freed (aborted) before alloc.
        blocks = self.req_to_blocks.pop(request.request_id, [])
        if self.enable_caching:
            # Free blocks in reverse order so that the tail blocks are
            # freed first.
            blocks = reversed(blocks)

        for block in blocks:
            block.ref_cnt -= 1
            if block.ref_cnt == 0:
                self.free_block_queue.append(block)

    def _get_new_blocks(
            self,
            num_blocks: int,
            token_ids: Optional[List[int]] = None,
            parent_block: Optional[KVCacheBlock] = None
    ) -> List[KVCacheBlock]:
        """Get new blocks from the free block pool, and add token IDs to
        allocated blocks if caching is enabled.
        Note that we do not check the block cache in this function.

        Args:
            num_blocks: The number of blocks to allocate.
            token_ids: The token IDs in the blocks. None if caching is
                disabled.
            parent_block: The parent block. Used to include the block chain
                in the block hash.

        Returns:
            A list of new blocks.
        """
        if num_blocks > self.free_block_queue.num_free_blocks:
            raise ValueError(
                f"Cannot get {num_blocks} free blocks from the pool")

        # First allocate blocks.
        ret: List[KVCacheBlock] = []
        idx = 0
        while idx < num_blocks:
            curr_block = self.free_block_queue.popleft()
            assert curr_block.ref_cnt == 0

            # Evict blocks from the cache.
            if self.enable_caching:
                block_hash = curr_block.block_hash
                if (block_hash is not None
                        and block_hash in self.cached_block_hash_to_block):
                    if len(self.cached_block_hash_to_block[block_hash]) == 1:
                        del self.cached_block_hash_to_block[block_hash]
                    else:
                        del self.cached_block_hash_to_block[block_hash][
                            curr_block.block_id]
                curr_block.reset()

            curr_block.ref_cnt = 1
            ret.append(curr_block)
            idx += 1

        # Then assign token IDs to the allocated blocks.
        if self.enable_caching:
            assert token_ids is not None
            token_id_idx = self._add_token_ids_to_blocks(
                blocks=ret, token_ids=token_ids, parent_block=parent_block)
            assert token_id_idx == len(token_ids)

        return ret

    def _cache_full_block(self,
                          block: KVCacheBlock,
                          parent_block: Optional[KVCacheBlock] = None) -> None:
        """Cache a full block for prefix caching.

        Args:
            block: The block to cache.
            parent_block: The parent block. None if this is the first block.
        """
        parent_block_hash = (parent_block.block_hash
                             if parent_block is not None else None)
        assert len(block.token_ids) == self.block_size
        block.token_ids = tuple(block.token_ids)
        block_hash = hash_block_tokens(parent_block_hash, block.token_ids)
        block.block_hash = block_hash
        block.num_hashed_tokens = self.block_size + (
            parent_block.num_hashed_tokens if parent_block is not None else 0)
        self.cached_block_hash_to_block[block_hash][block.block_id] = block

    def _get_cached_block(self,
                          block_hash: BlockHashType) -> Optional[KVCacheBlock]:
        """Get a cached block by the block hash, or None on a cache miss.
        If there are duplicated blocks, we return the first block in the cache.

        Args:
            block_hash: The hash value of the block.

        Returns:
            The cached block if it exists, or None.
        """
        if block_hash in self.cached_block_hash_to_block:
            first_block_id = list(
                self.cached_block_hash_to_block[block_hash].keys())[0]
            return self.cached_block_hash_to_block[block_hash][first_block_id]
        return None

    def _touch(self, blocks: List[KVCacheBlock]) -> None:
        """Touching a block increases its reference count by 1, and may remove
        the block from the free queue. This is used when a block is hit by
        another request with the same prefix.

        Args:
            blocks: A list of blocks to touch.
        """
        for block in blocks:
            # ref_cnt=0 means this block is in the free list (i.e. eviction
            # candidate), so remove it.
            if block.ref_cnt == 0:
                self.free_block_queue.remove(block)
            block.ref_cnt += 1

    def _add_token_ids_to_blocks(
            self,
            blocks: List[KVCacheBlock],
            token_ids: List[int],
            parent_block: Optional[KVCacheBlock] = None) -> int:
        """Add token IDs to a list of allocated blocks.
        If a block becomes full after adding token IDs, cache it.
        Return the token ID index that has not been added to the blocks
        if the blocks are not enough to hold all the token IDs.

        Args:
            blocks: A list of blocks to add token IDs.
            token_ids: A list of token IDs to add.
            parent_block: The parent block. None if this is the
                first block.

        Returns:
            The starting token ID index that has not been added to the blocks
            due to insufficient given blocks.
        """
        token_id_start = 0
        for curr_block in blocks:
            # If all token IDs are added, then the rest of the blocks are
            # preallocated blocks, so we only need to update the
            # parent_block_id. FIXME
            if token_id_start == len(token_ids):
                continue

            # Add token IDs to the empty slots in the block.
            empty_slots = self.block_size - len(curr_block.token_ids)
            token_id_end = min(token_id_start + empty_slots, len(token_ids))
            curr_block.token_ids.extend(token_ids[token_id_start:token_id_end])
            # Cache the block if it becomes full.
            if len(curr_block.token_ids) == self.block_size:
                self._cache_full_block(curr_block, parent_block)
            parent_block = curr_block
            token_id_start = token_id_end
        return token_id_start
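
In short: allocate_slots sets up a request's block table at prefill (plus num_preallocate_blocks of headroom), append_slots extends it during decode, and free returns everything to the eviction queue. A sketch of that life cycle with prefix caching disabled, using a hypothetical stand-in request (the real Request type supplies these fields and more):

# Sketch only: _StubRequest stands in for vllm.v1.request.Request.
class _StubRequest:
    def __init__(self, request_id, num_tokens):
        self.request_id = request_id
        self.all_token_ids = list(range(num_tokens))
        self.num_computed_tokens = 0

manager = KVCacheManager(block_size=16, num_gpu_blocks=128,
                         enable_caching=False, num_preallocate_tokens=64)
req = _StubRequest("req-0", num_tokens=40)

# Prefill: 40 tokens need cdiv(40, 16) = 3 blocks; with 64 preallocated
# tokens (4 blocks) of headroom, min(3 + 4, 128 free) = 7 blocks are
# handed out.
new_blocks = manager.allocate_slots(req, num_tokens=40, computed_blocks=[])
assert len(new_blocks) == 7

# Decode: while the preallocated blocks still have room, append_slots
# returns [] (no new allocation); once they fill up, it allocates again.
req.num_computed_tokens = 40
assert manager.append_slots(req, num_tokens=1) == []

manager.free(req)   # all 7 blocks return to the free queue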

vllm-v0.6.2/vllm/v1/core/kv_cache_utils.py (new file, 194 lines)
@@ -0,0 +1,194 @@
"""KV-Cache Utilities."""
from dataclasses import dataclass, field
from typing import List, Optional, Tuple, Union

from vllm.logger import init_logger

logger = init_logger(__name__)

BlockHashType = Tuple[int, Tuple[int]]


@dataclass
class KVCacheBlock:
    """KV-cache block metadata."""
    # Block ID, ranging from 0 to num_gpu_blocks - 1.
    block_id: int
    # Reference count.
    ref_cnt: int = 0
    # Token IDs in the block. When the block is full, the type of token_ids
    # should be Tuple[int] for fast matching.
    token_ids: Union[List[int], Tuple[int]] = field(default_factory=list)
    # The hash of the block composed of (block hash, tuple of token IDs).
    # It is only available when the block is full.
    block_hash: Optional[BlockHashType] = None
    # The number of hashed tokens. More hashed tokens means the block
    # is closer to the end of a prompt and more likely to be evicted.
    num_hashed_tokens: int = 0

    # Used to construct a doubly linked list for free blocks.
    # These two attributes should only be manipulated by
    # FreeKVCacheBlockQueue.
    prev_free_block: Optional["KVCacheBlock"] = None
    next_free_block: Optional["KVCacheBlock"] = None

    def reset(self):
        """Reset the block metadata."""
        self.ref_cnt = 0
        self.token_ids = []
        self.block_hash = None
        self.num_hashed_tokens = 0


class FreeKVCacheBlockQueue:
    """This class organizes a list of KVCacheBlock objects into a doubly
    linked list of free blocks. We implement this class instead of using the
    Python builtin deque to support removing a block in the middle of the
    queue in O(1) time. To close the performance gap to the builtin deque,
    which is implemented in C++, this class does not allocate any Python
    objects when manipulating the linked list. Instead, this class
    manipulates the prev_free_block and next_free_block attributes of the
    given blocks.

    The queue is ordered by block ID in the beginning. When a block is
    allocated and then freed, it will be appended back with the eviction
    order:
    1. The least recently used block is at the front (LRU).
    2. If two blocks have the same last accessed time (allocated by the
       same sequence), the one with more hashed tokens (the tail of a block
       chain) is at the front.
    Note that we maintain this order by reversing the block order when
    freeing the blocks of a request. This operation is outside of this class.

    Args:
        blocks: A list of KVCacheBlock objects.
    """

    def __init__(self, blocks: List[KVCacheBlock]) -> None:
        self.num_free_blocks = len(blocks)

        # Initialize the doubly linked list of free blocks.
        self.free_list_head = blocks[0]
        self.free_list_tail = blocks[-1]
        for i in range(self.num_free_blocks):
            if i > 0:
                blocks[i].prev_free_block = blocks[i - 1]
            if i < self.num_free_blocks - 1:
                blocks[i].next_free_block = blocks[i + 1]

    def popleft(self) -> KVCacheBlock:
        """Pop the first free block and reduce num_free_blocks by 1.

        Returns:
            The first free block.
        """
        if not self.free_list_head:
            raise ValueError("No free blocks available")

        block = self.free_list_head
        self.remove(block)
        return block

    def remove(self, block: KVCacheBlock) -> None:
        """Remove a block from the free list and reduce num_free_blocks by 1.

        Args:
            block: The block to remove.
        """
        if block.prev_free_block is not None:
            # Link the previous block to the next block.
            block.prev_free_block.next_free_block = block.next_free_block
        if block.next_free_block is not None:
            # Link the next block to the previous block.
            block.next_free_block.prev_free_block = block.prev_free_block

        if block == self.free_list_head:
            # Update the head if the block is the head.
            self.free_list_head = block.next_free_block
        if block == self.free_list_tail:
            # Update the tail if the block is the tail.
            self.free_list_tail = block.prev_free_block

        # Remove the block from the linked list.
        block.prev_free_block = block.next_free_block = None
        self.num_free_blocks -= 1

    def append(self, block: KVCacheBlock) -> None:
        """Put a block back into the free list and increase
        num_free_blocks by 1.

        Args:
            block: The block to append.
        """
        if self.free_list_tail is not None:
            # Link the last block to the new block.
            self.free_list_tail.next_free_block = block
            block.prev_free_block = self.free_list_tail
            self.free_list_tail = block
        else:
            # The free list is empty.
            assert self.free_list_head is None
            self.free_list_head = self.free_list_tail = block

        block.next_free_block = None
        self.num_free_blocks += 1

    def get_all_free_blocks(self) -> List[KVCacheBlock]:
        """Get all free blocks in the free list. Mainly used for testing.

        Returns:
            A list of free blocks.
        """
        ret = []
        curr_block = self.free_list_head
        while curr_block is not None:
            ret.append(curr_block)
            curr_block = curr_block.next_free_block
        return ret


def hash_block_tokens(parent_block_hash: Optional[int],
                      curr_block_token_ids: Tuple[int]) -> BlockHashType:
    """Computes a hash value corresponding to the contents of a block and
    the contents of the preceding block(s). The hash value is used for
    prefix caching. We use LRU cache for this function to avoid recomputing
    hash values for the same block contents.

    TODO: Support arbitrary metadata so that we could support more
    features such as LoRA adapter.

    Args:
        parent_block_hash: The hash of the parent block. None
            if this is the first block.
        curr_block_token_ids: A tuple of token ids in the current
            block. The current block is assumed to be full.

    Returns:
        The hash value of the block and the token ids in the block.
        The entire tuple is used as the hash key of the block.
    """
    return (hash(
        (parent_block_hash, *curr_block_token_ids)), curr_block_token_ids)


def hash_request_tokens(block_size: int,
                        token_ids: List[int]) -> List[BlockHashType]:
    """Computes hash values of a chain of blocks given a sequence of
    token IDs. The hash value is used for prefix caching.

    Args:
        block_size: The size of each block.
        token_ids: A sequence of token ids in the request.

    Returns:
        The list of computed hash values.
    """
    ret = []
    parent_block_hash = None
    for start in range(0, len(token_ids), block_size):
        end = start + block_size
        block_token_ids = tuple(token_ids[start:end])
        # Do not hash the block if it is not full.
        if len(block_token_ids) < block_size:
            break
        block_hash = hash_block_tokens(parent_block_hash, block_token_ids)
        ret.append(block_hash)
        parent_block_hash = block_hash
    return ret
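
Because each block hash folds in the parent's hash, a block's key commits to every token that precedes it: matching block i in the cache implies blocks 0..i-1 match too, which is what lets get_computed_blocks stop at the first miss. A small illustrative check (block_size=4 is arbitrary):

# Chained hashing: the trailing partial block is never hashed.
token_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]   # 2 full blocks + 1 partial
hashes = hash_request_tokens(block_size=4, token_ids=token_ids)
assert len(hashes) == 2

h0 = hash_block_tokens(None, (1, 2, 3, 4))
h1 = hash_block_tokens(h0, (5, 6, 7, 8))   # parent hash folded in
assert hashes == [h0, h1]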

vllm-v0.6.2/vllm/v1/core/scheduler.py (new file, 591 lines)
@@ -0,0 +1,591 @@
from collections import deque
from dataclasses import dataclass
from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
                    Tuple, Union)

from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
from vllm.v1.core.kv_cache_manager import KVCacheManager
from vllm.v1.engine import EngineCoreOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus

if TYPE_CHECKING:
    from vllm.multimodal import MultiModalKwargs
    from vllm.multimodal.base import PlaceholderRange

logger = init_logger(__name__)


class Scheduler:

    def __init__(
        self,
        scheduler_config: SchedulerConfig,
        cache_config: CacheConfig,
        lora_config: Optional[LoRAConfig],
    ) -> None:
        self.scheduler_config = scheduler_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        # TODO: Support LoRA.
        assert lora_config is None, "V1 does not support LoRA yet."

        num_gpu_blocks = cache_config.num_gpu_blocks
        assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
        # Create the block space manager.
        self.kv_cache_manager = KVCacheManager(
            block_size=self.cache_config.block_size,
            num_gpu_blocks=num_gpu_blocks,
            sliding_window=self.cache_config.sliding_window,
            enable_caching=self.cache_config.enable_prefix_caching)
        self.block_size = self.cache_config.block_size

        # Scheduling constraints.
        self.max_num_running_reqs = self.scheduler_config.max_num_seqs
        self.max_num_scheduled_tokens = \
            self.scheduler_config.max_num_batched_tokens
        self.max_model_len = self.scheduler_config.max_model_len

        # req_id -> Request
        self.requests: Dict[str, Request] = {}
        # Priority queues for requests.
        self.waiting: Deque[Request] = deque()
        self.running: List[Request] = []

        # The request IDs that are finished in between the previous and the
        # current steps. This is used to notify the workers about the finished
        # requests so that they can free the cached states for those requests.
        # This is flushed at the end of each scheduling step.
        self.finished_req_ids: Set[str] = set()

        # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating
        # them at each scheduling step.
        # Request id -> RunningRequestData
        self.running_reqs_data: Dict[str, RunningRequestData] = {}

        # Encoder-related.
        # NOTE(woosuk): Here, "encoder" includes the vision encoder (and
        # projector if needed). Currently, we assume that the encoder also
        # has the Transformer architecture (e.g., ViT).
        # FIXME(woosuk): Below are placeholder values. We need to calculate
        # the actual values from the configurations.
        self.max_num_encoder_input_tokens = 2048
        # NOTE(woosuk): For the models without encoder (e.g., text-only
        # models), the encoder cache will not be initialized and used,
        # regardless of the cache size. This is because the memory space for
        # the encoder cache is preallocated in the profiling run.
        self.encoder_cache_manager = EncoderCacheManager(cache_size=2048)

    def schedule(self) -> "SchedulerOutput":
        # NOTE(woosuk) on the scheduling algorithm:
        # There's no "decoding phase" nor "prefill phase" in the scheduler.
        # Each request just has the num_computed_tokens and num_tokens,
        # which is equal to len(prompt_token_ids) + len(output_token_ids).
        # At each step, the scheduler tries to assign tokens to the requests
        # so that each request's num_computed_tokens can catch up to its
        # num_tokens. This is general enough to cover chunked prefills,
        # prefix caching, and the "jump decoding" optimization in the future.

        scheduled_new_reqs: List[Request] = []
        scheduled_resumed_reqs: List[Request] = []
        scheduled_running_reqs: List[Request] = []
        preempted_reqs: List[Request] = []

        req_to_new_block_ids: Dict[str, List[int]] = {}
        num_scheduled_tokens: Dict[str, int] = {}
        token_budget = self.max_num_scheduled_tokens
        # Encoder-related.
        scheduled_encoder_inputs: Dict[str, List[int]] = {}
        encoder_budget = self.max_num_encoder_input_tokens

        # First, schedule the RUNNING requests.
        # NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to
        # be in the "partial" state, where the request has some tokens
        # computed but not all. The constraint is due to the persistent batch
        # in the V1 model runner.
        # TODO(woosuk): Remove this constraint after refactoring model runner.
        has_partial_request = False
        req_index = 0
        while req_index < len(self.running):
            # Only the last request in the RUNNING queue can be "partial".
            assert not has_partial_request
            assert token_budget > 0
            request = self.running[req_index]
            num_new_tokens = request.num_tokens - request.num_computed_tokens
            num_new_tokens = min(num_new_tokens, token_budget)
            assert num_new_tokens > 0

            # Schedule encoder inputs.
            encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = (
                self._try_schedule_encoder_inputs(request,
                                                  request.num_computed_tokens,
                                                  num_new_tokens,
                                                  encoder_budget))
            assert num_new_tokens > 0

            while True:
                new_blocks = self.kv_cache_manager.append_slots(
                    request, num_new_tokens)
                if new_blocks is None:
                    # The request cannot be scheduled.
                    # Preempt the lowest-priority request.
                    preempted_req = self.running.pop()
                    self.kv_cache_manager.free(preempted_req)
                    preempted_req.status = RequestStatus.PREEMPTED
                    preempted_req.num_computed_tokens = 0

                    self.waiting.appendleft(preempted_req)
                    preempted_reqs.append(preempted_req)
                    if preempted_req == request:
                        # No more request to preempt.
                        can_schedule = False
                        break
                else:
                    # The request can be scheduled.
                    can_schedule = True
                    break
            if not can_schedule:
                break

            # Schedule the request.
            scheduled_running_reqs.append(request)
            req_to_new_block_ids[request.request_id] = [
                b.block_id for b in new_blocks
            ]
            num_scheduled_tokens[request.request_id] = num_new_tokens
            token_budget -= num_new_tokens
            req_index += 1
            has_partial_request = (request.num_computed_tokens +
                                   num_new_tokens < request.num_tokens)

            # Encoder-related.
            if encoder_inputs_to_schedule:
                scheduled_encoder_inputs[request.request_id] = (
                    encoder_inputs_to_schedule)
                # Allocate the encoder cache.
                for i in encoder_inputs_to_schedule:
                    self.encoder_cache_manager.allocate(request, i)
                encoder_budget = new_encoder_budget

        # Next, schedule the WAITING requests.
        if not preempted_reqs:
            while self.waiting:
                if has_partial_request:
                    break
                if len(self.running) == self.max_num_running_reqs:
                    break
                if token_budget == 0:
                    break

                request = self.waiting[0]
                # Get already-cached tokens.
                computed_blocks = self.kv_cache_manager.get_computed_blocks(
                    request)
                # NOTE(woosuk): Since incomplete blocks are not eligible for
                # sharing, `num_computed_tokens` is always a multiple of
                # `block_size`.
                num_computed_tokens = len(computed_blocks) * self.block_size
                # Number of tokens to be scheduled.
                # We use `request.num_tokens` instead of
                # `request.num_prompt_tokens` to consider the resumed
                # requests, which have output tokens.
                num_new_tokens = request.num_tokens - num_computed_tokens
                if num_new_tokens == 0:
                    # This happens when the prompt length is divisible by the
                    # block size and all blocks are cached. Now we force to
                    # recompute the last token.
                    num_computed_tokens -= 1
                    num_new_tokens = 1
                    computed_blocks.pop()
                num_new_tokens = min(num_new_tokens, token_budget)
                assert num_new_tokens > 0

                # Schedule encoder inputs.
                (encoder_inputs_to_schedule, num_new_tokens,
                 new_encoder_budget) = self._try_schedule_encoder_inputs(
                     request, num_computed_tokens, num_new_tokens,
                     encoder_budget)
                if num_new_tokens == 0:
                    # The request cannot be scheduled.
                    break

                new_blocks = self.kv_cache_manager.allocate_slots(
                    request, num_new_tokens, computed_blocks)
                if new_blocks is None:
                    # The request cannot be scheduled.
                    break

                self.waiting.popleft()
                self.running.append(request)
                if request.status == RequestStatus.WAITING:
                    scheduled_new_reqs.append(request)
                elif request.status == RequestStatus.PREEMPTED:
                    scheduled_resumed_reqs.append(request)
                else:
                    raise RuntimeError(
                        f"Invalid request status: {request.status}")

                req_to_new_block_ids[request.request_id] = [
                    b.block_id for b in computed_blocks + new_blocks
                ]
                num_scheduled_tokens[request.request_id] = num_new_tokens
                token_budget -= num_new_tokens
                request.status = RequestStatus.RUNNING
                request.num_computed_tokens = num_computed_tokens
                has_partial_request = (num_computed_tokens + num_new_tokens <
                                       request.num_tokens)

                # Encoder-related.
                if encoder_inputs_to_schedule:
                    scheduled_encoder_inputs[request.request_id] = (
                        encoder_inputs_to_schedule)
                    # Allocate the encoder cache.
                    for i in encoder_inputs_to_schedule:
                        self.encoder_cache_manager.allocate(request, i)
                    encoder_budget = new_encoder_budget

        # Check if the scheduling constraints are satisfied.
        total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
        assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
        assert token_budget >= 0
        assert len(self.running) <= self.max_num_running_reqs
        assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) +
                len(scheduled_running_reqs) == len(self.running))

        # Construct the scheduler output.
        new_reqs_data = [
            NewRequestData.from_request(req,
                                        req_to_new_block_ids[req.request_id],
                                        req.num_computed_tokens)
            for req in scheduled_new_reqs
        ]
        resumed_reqs_data = [
            ResumedRequestData.from_request(
                req, req_to_new_block_ids[req.request_id],
                req.num_computed_tokens) for req in scheduled_resumed_reqs
        ]
        running_reqs_data = [
            self._make_running_request_data(
                req, req_to_new_block_ids[req.request_id],
                req.num_computed_tokens) for req in scheduled_running_reqs
        ]
        preempted_req_ids = {req.request_id for req in preempted_reqs}
        scheduler_output = SchedulerOutput(
            scheduled_new_reqs=new_reqs_data,
            scheduled_resumed_reqs=resumed_reqs_data,
            scheduled_running_reqs=running_reqs_data,
            num_scheduled_tokens=num_scheduled_tokens,
            total_num_scheduled_tokens=total_num_scheduled_tokens,
            scheduled_encoder_inputs=scheduled_encoder_inputs,
            preempted_req_ids=preempted_req_ids,
            # finished_req_ids is an existing state in the scheduler,
            # instead of being newly scheduled in this step.
            # It contains the request IDs that are finished in between
            # the previous and the current steps.
            finished_req_ids=self.finished_req_ids,
            free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
        )

        self.finished_req_ids = set()
        return scheduler_output

    def _make_running_request_data(
        self,
        request: Request,
        new_block_ids: List[int],
        num_computed_tokens: int,
    ) -> "RunningRequestData":
        # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating
        # them at each scheduling step.
        if request.request_id in self.running_reqs_data:
            req_data = self.running_reqs_data[request.request_id]
            req_data.new_block_ids = new_block_ids
            req_data.num_computed_tokens = num_computed_tokens
        else:
            req_data = RunningRequestData.from_request(request, new_block_ids,
                                                       num_computed_tokens)
            self.running_reqs_data[request.request_id] = req_data
        return req_data

    def _try_schedule_encoder_inputs(
        self,
        request: Request,
        num_computed_tokens: int,
        num_new_tokens: int,
        encoder_budget: int,
    ) -> Tuple[List[int], int, int]:
        """
        Determine which encoder inputs need to be scheduled in the current
        step, and update `num_new_tokens` and the encoder token budget
        accordingly.

        An encoder input will be scheduled if:
        - Its output tokens overlap with the range of tokens being computed
          in this step, i.e.,
          [num_computed_tokens, num_computed_tokens + num_new_tokens).
        - It is not already computed and stored in the encoder cache.
        - There is sufficient encoder token budget to process it.
        - The encoder cache has space to store it.

        If an encoder input cannot be scheduled due to cache or budget
        limitations, the method adjusts `num_new_tokens` to schedule only the
        decoder tokens up to just before the unschedulable encoder input.
        """
        if not request.has_encoder_inputs():
            return [], num_new_tokens, encoder_budget

        encoder_inputs_to_schedule: List[int] = []
        mm_positions = request.mm_positions
        assert mm_positions is not None
        assert len(mm_positions) > 0
        for i, pos_info in enumerate(mm_positions):
            start_pos = pos_info["offset"]
            num_encoder_tokens = pos_info["length"]

            # The encoder output is needed if the two ranges overlap:
            # [num_computed_tokens, num_computed_tokens + num_new_tokens) and
            # [start_pos, start_pos + num_encoder_tokens)
            if start_pos >= num_computed_tokens + num_new_tokens:
                # The encoder input is not needed in this step.
                break
            if start_pos + num_encoder_tokens <= num_computed_tokens:
                # The encoder input is already computed and stored
                # in the decoder's KV cache.
                continue

            if self.encoder_cache_manager.has_cache(request, i):
                # The encoder input is already computed and cached.
                continue
            if not self.encoder_cache_manager.can_allocate(request, i):
                # The encoder cache is full. We can only schedule the decoder
                # tokens just before the encoder input.
                num_new_tokens = start_pos - num_computed_tokens
                break
            if num_encoder_tokens > encoder_budget:
                # The encoder budget is exhausted. We can only schedule the
                # decoder tokens up until the encoder input.
                # NOTE(woosuk): We assume that the encoder tokens should be
                # processed altogether, as the encoder usually uses
                # bidirectional attention.
                num_new_tokens = start_pos - num_computed_tokens
                break

            encoder_budget -= num_encoder_tokens
            encoder_inputs_to_schedule.append(i)
        return encoder_inputs_to_schedule, num_new_tokens, encoder_budget

    def update_from_output(
        self,
        scheduler_output: "SchedulerOutput",
        model_runner_output: "ModelRunnerOutput",
    ) -> List[EngineCoreOutput]:
        # NOTE(woosuk): This method doesn't consider speculative decoding.
        sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist()
        num_scheduled_tokens = scheduler_output.num_scheduled_tokens
        new_running: List[Request] = []
        engine_core_outputs: List[EngineCoreOutput] = []
        for request in self.running:
            req_id = request.request_id
            request.num_computed_tokens += num_scheduled_tokens[req_id]
            # When the request's num_computed_tokens catches up to its
            # num_tokens, the request generates output tokens. Otherwise,
            # we ignore the sampler output for the request.
            assert request.num_computed_tokens <= request.num_tokens

            cached_encoder_input_ids = (
                self.encoder_cache_manager.get_cached_input_ids(request))
            for input_id in list(cached_encoder_input_ids):
                start_pos = request.mm_positions[input_id]["offset"]
                num_tokens = request.mm_positions[input_id]["length"]
                if start_pos + num_tokens <= request.num_computed_tokens:
                    # The encoder output is already processed and stored
                    # in the decoder's KV cache.
                    self.encoder_cache_manager.free(request, input_id)

            if request.num_computed_tokens == request.num_tokens:
                req_index = model_runner_output.req_id_to_index[req_id]
                # NOTE(woosuk): Currently, we assume that each request
                # generates at most one token at each step.
                token_id = sampled_token_ids[req_index]
                request.append_output_token_ids(token_id)
                num_new_tokens = 1
                # TODO: Update the KV cache manager for prefix caching.

                # Check for stop and update request state.
                # This must be called before we make the EngineCoreOutput.
                stopped = self._check_stop(request)

                # Add EngineCoreOutput for this Request.
                output = EngineCoreOutput(
                    request_id=req_id,
                    new_token_ids=request.output_token_ids[-num_new_tokens:],
                    finished=request.is_finished(),
                    finish_reason=request.get_finished_reason(),
                    stop_reason=request.stop_reason)
                engine_core_outputs.append(output)

                # Break out of the loop.
                if stopped:
                    continue

            new_running.append(request)
        self.running = new_running
        return engine_core_outputs

    def _check_stop(self, request: Request) -> bool:
        if (request.num_tokens >= self.max_model_len
                or request.num_output_tokens >= request.max_tokens):
            request.status = RequestStatus.FINISHED_LENGTH_CAPPED
            self._free_request(request)
            return True

        sampling_params = request.sampling_params
        last_token_id = request.output_token_ids[-1]
        if (not sampling_params.ignore_eos
                and last_token_id == request.eos_token_id):
            request.status = RequestStatus.FINISHED_STOPPED
            self._free_request(request)
            return True

        if last_token_id in (sampling_params.stop_token_ids or ()):
            request.status = RequestStatus.FINISHED_STOPPED
            request.stop_reason = last_token_id
            self._free_request(request)
            return True
        return False

    def add_request(self, request: Request) -> None:
        self.waiting.append(request)
        self.requests[request.request_id] = request

    def finish_requests(
        self,
        request_ids: Union[str, Iterable[str]],
        finished_status: RequestStatus,
    ) -> None:
        """Handles the finish signal from outside the scheduler.

        For example, the API server can abort a request when the client
        disconnects.
        """
        assert RequestStatus.is_finished(finished_status)
        if isinstance(request_ids, str):
            request_ids = (request_ids, )
        request_ids = set(request_ids)

        for req_id in request_ids:
            request = self.requests.get(req_id)
            if request is None:
                # Invalid request ID.
                continue

            if request.status == RequestStatus.RUNNING:
                self.running.remove(request)
            else:
                self.waiting.remove(request)
            request.status = finished_status
            self._free_request(request)

    def _free_request(self, request: Request) -> None:
        assert request.is_finished()
        self.kv_cache_manager.free(request)
        self.running_reqs_data.pop(request.request_id, None)
        del self.requests[request.request_id]
        self.finished_req_ids.add(request.request_id)

    def get_num_unfinished_requests(self) -> int:
        return len(self.waiting) + len(self.running)

    def has_unfinished_requests(self) -> bool:
        return self.get_num_unfinished_requests() > 0


@dataclass
class NewRequestData:

    req_id: str
    prompt_token_ids: List[int]
    prompt: Optional[str]
    mm_inputs: List["MultiModalKwargs"]
    mm_positions: List["PlaceholderRange"]
    sampling_params: SamplingParams
    block_ids: List[int]
    num_computed_tokens: int

    @classmethod
    def from_request(
        cls,
        request: Request,
        block_ids: List[int],
        num_computed_tokens: int,
    ) -> "NewRequestData":
        return cls(
            req_id=request.request_id,
            prompt_token_ids=request.prompt_token_ids,
            prompt=request.prompt,
            mm_inputs=request.mm_inputs,
            mm_positions=request.mm_positions,
            sampling_params=request.sampling_params,
            block_ids=block_ids,
            num_computed_tokens=num_computed_tokens,
        )


@dataclass
class ResumedRequestData:

    req_id: str
    block_ids: List[int]
    num_computed_tokens: int

    @classmethod
    def from_request(
        cls,
        request: Request,
        block_ids: List[int],
        num_computed_tokens: int,
    ) -> "ResumedRequestData":
        return cls(
            req_id=request.request_id,
            block_ids=block_ids,
            num_computed_tokens=num_computed_tokens,
        )


@dataclass
class RunningRequestData:

    req_id: str
    new_block_ids: List[int]
    num_computed_tokens: int

    @classmethod
    def from_request(
        cls,
        request: Request,
        new_block_ids: List[int],
        num_computed_tokens: int,
    ) -> "RunningRequestData":
        return cls(
            req_id=request.request_id,
            new_block_ids=new_block_ids,
            num_computed_tokens=num_computed_tokens,
        )


@dataclass
class SchedulerOutput:

    scheduled_new_reqs: List[NewRequestData]
    scheduled_resumed_reqs: List[ResumedRequestData]
    scheduled_running_reqs: List[RunningRequestData]

    num_scheduled_tokens: Dict[str, int]
    total_num_scheduled_tokens: int
    scheduled_encoder_inputs: Dict[str, List[int]]

    preempted_req_ids: Set[str]
    finished_req_ids: Set[str]
    free_encoder_input_ids: List[Tuple[str, int]]
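
For orientation, the loop that would drive this scheduler looks roughly as follows. This is a hedged sketch: the engine core and model runner that call into Scheduler live outside this commit, and execute_model here is a placeholder name, not an API defined above.

# Sketch only: model_runner and its execute_model method are placeholders.
def engine_step(scheduler, model_runner):
    # Decide, per request, how many tokens to compute this step and which
    # encoder inputs to run; KV-cache blocks are allocated along the way.
    scheduler_output = scheduler.schedule()
    model_runner_output = model_runner.execute_model(scheduler_output)
    # Feed the sampled tokens back: advances num_computed_tokens, appends
    # output tokens, applies stop checks, and frees finished requests.
    return scheduler.update_from_output(scheduler_output,
                                        model_runner_output)

# while scheduler.has_unfinished_requests():
#     outputs = engine_step(scheduler, model_runner)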