add qwen3

Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,48 @@
from typing import Dict, List, Set, Tuple
from vllm.v1.request import Request
class EncoderCacheManager:
def __init__(self, cache_size: int):
self.cache_size = cache_size
self.num_free_slots = cache_size
# req_id -> cached input ids
self.cached: Dict[str, Set[int]] = {}
# List of (req_id, input_id) tuples of freed encoder inputs
self.freed: List[Tuple[str, int]] = []
def has_cache(self, request: Request, input_id: int) -> bool:
req_id = request.request_id
return req_id in self.cached and input_id in self.cached[req_id]
def can_allocate(self, request: Request, input_id: int) -> bool:
num_tokens = request.get_num_encoder_tokens(input_id)
return num_tokens <= self.num_free_slots
def allocate(self, request: Request, input_id: int) -> None:
req_id = request.request_id
if req_id not in self.cached:
self.cached[req_id] = set()
self.cached[req_id].add(input_id)
self.num_free_slots -= request.get_num_encoder_tokens(input_id)
def get_cached_input_ids(self, request: Request) -> Set[int]:
return self.cached.get(request.request_id, set())
def free(self, request: Request, input_id: int) -> None:
req_id = request.request_id
if req_id not in self.cached:
return
self.cached[req_id].discard(input_id)
if len(self.cached[req_id]) == 0:
del self.cached[req_id]
self.num_free_slots += request.get_num_encoder_tokens(input_id)
self.freed.append((req_id, input_id))
def get_freed_ids(self) -> List[Tuple[str, int]]:
freed = self.freed
self.freed = []
return freed
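A minimal usage sketch of EncoderCacheManager, assuming vLLM is importable; _DummyRequest is a hypothetical stand-in exposing only the two members the manager reads (request_id and get_num_encoder_tokens):

from vllm.v1.core.encoder_cache_manager import EncoderCacheManager

class _DummyRequest:
    # Hypothetical stand-in: only the fields EncoderCacheManager touches.
    def __init__(self, request_id, num_encoder_tokens_by_input):
        self.request_id = request_id
        self._num_encoder_tokens = num_encoder_tokens_by_input

    def get_num_encoder_tokens(self, input_id):
        return self._num_encoder_tokens[input_id]

cache = EncoderCacheManager(cache_size=2048)
req = _DummyRequest("req-0", {0: 576})
if not cache.has_cache(req, 0) and cache.can_allocate(req, 0):
    cache.allocate(req, 0)                    # 576 slots now in use
assert cache.num_free_slots == 2048 - 576
cache.free(req, 0)                            # slots returned, freed id recorded
assert cache.get_freed_ids() == [("req-0", 0)]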

View File

@@ -0,0 +1,397 @@
from collections import defaultdict
from typing import Dict, List, Optional
from vllm.logger import init_logger
from vllm.utils import cdiv
from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
KVCacheBlock, hash_block_tokens,
hash_request_tokens)
from vllm.v1.request import Request
logger = init_logger(__name__)
class KVCacheManager:
def __init__(
self,
block_size: int,
num_gpu_blocks: int,
sliding_window: Optional[int] = None,
enable_caching: bool = True,
num_preallocate_tokens: int = 64,
) -> None:
self.block_size = block_size
self.num_gpu_blocks = num_gpu_blocks
self.sliding_window = sliding_window
self.enable_caching = enable_caching
# NOTE(woosuk): To avoid frequent block allocation, we preallocate some
# blocks for each request. For example, when a request reaches the end
# of its block table, we preallocate N blocks in advance. This way, we
# reduce the overhead of updating free_block_ids and ref_cnts for each
# request every step (at the cost of some memory waste).
# NOTE(woosuk): This is different from the "lookahead" slots since this
# does not guarantee that the request always has N empty blocks. After
# the request gets N empty blocks, it starts to use the blocks without
# further allocation. When it uses up all the N empty blocks, it gets
# N new empty blocks.
self.num_preallocate_tokens = num_preallocate_tokens
self.num_preallocate_blocks = cdiv(num_preallocate_tokens, block_size)
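# For example, with the default num_preallocate_tokens=64 and a block_size
# of 16, cdiv(64, 16) = 4 blocks are preallocated whenever a request
# outgrows its block table.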
# A Block pool of all kv-cache blocks.
self.block_pool: List[KVCacheBlock] = [
KVCacheBlock(idx) for idx in range(num_gpu_blocks)
]
# Free block queue that constructs and manipulates a doubly linked
# list of free blocks (including eviction candidates when caching is
# enabled).
self.free_block_queue = FreeKVCacheBlockQueue(self.block_pool)
# {block_hash: {block ID: block}}. A cached block is
# a full block with a block hash that can be used for prefix caching.
# The cached block may be used by running requests or in the
# free_block_queue that could potentially be evicted.
# NOTE: We currently don't de-duplicate the blocks in the cache,
# meaning that if a block becomes full and is cached, we don't check
# if there is already an identical block in the cache. This is because
# we want to make sure the allocated block IDs won't change so that
# block tables are append-only.
self.cached_block_hash_to_block: Dict[BlockHashType, Dict[
int, KVCacheBlock]] = defaultdict(dict)
# Mapping from request ID to blocks to track the blocks allocated
# for each request, so that we can free the blocks when the request
# is finished.
self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
"""Get the computed (cached) blocks for the request.
Note that the computed blocks must be full.
Args:
request: The request to get the computed blocks.
Returns:
A list of blocks that are computed for the request.
"""
if not self.enable_caching:
# Prefix caching is disabled.
return []
computed_blocks = []
block_hashes = hash_request_tokens(self.block_size,
request.all_token_ids)
for block_hash in block_hashes:
# block_hashes is a chain of block hashes. If a block hash is not
# in cached_block_hash_to_block, the subsequent block hashes are
# certainly not computed yet.
if cached_block := self._get_cached_block(block_hash):
computed_blocks.append(cached_block)
else:
break
return computed_blocks
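# Rough illustration (hypothetical numbers): with block_size=16 and a
# 40-token prompt whose first 32 tokens were cached by an earlier request,
# this returns the 2 full cached blocks; the trailing 8 tokens do not form
# a full block and are recomputed.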
def append_slots(
self,
request: Request,
num_tokens: int,
) -> Optional[List[KVCacheBlock]]:
"""Append slots to the block table of the request.
We first append slots to already allocated blocks. If the allocated
blocks are not enough, we allocate new blocks.
Args:
request: The request to append slots.
num_tokens: The number of tokens to append.
Returns:
A list of new blocks if new blocks are allocated, or None
if new blocks are required but cannot be allocated.
"""
num_required_blocks = cdiv(request.num_computed_tokens + num_tokens,
self.block_size)
req_blocks = self.req_to_blocks[request.request_id]
num_new_blocks = num_required_blocks - len(req_blocks)
if num_new_blocks > self.free_block_queue.num_free_blocks:
# Need to allocate new blocks due to insufficient pre-allocated
# slots, but we cannot allocate new blocks due to the limit.
return None
# When caching is enabled, assign token IDs to already allocated blocks.
new_token_ids = None
parent_block = None
if self.enable_caching:
# Figure out the token IDs to add to the blocks.
new_token_ids = request.all_token_ids[
request.num_computed_tokens:request.num_computed_tokens +
num_tokens]
# Find the last full block index.
# TODO: This may be optimized by calculating the computed tokens.
last_full_block_idx = len(req_blocks) - 1
while (last_full_block_idx >= 0
and req_blocks[last_full_block_idx].block_hash is None):
last_full_block_idx -= 1
parent_block = (req_blocks[last_full_block_idx]
if last_full_block_idx >= 0 else None)
token_id_idx = self._add_token_ids_to_blocks(
blocks=req_blocks[last_full_block_idx + 1:],
token_ids=new_token_ids,
parent_block=parent_block)
new_token_ids = new_token_ids[token_id_idx:]
parent_block = req_blocks[-1]
# No new block is needed. When caching is enabled, we make sure
# token_id_idx is equal to len(new_token_ids), meaning that all tokens
# are added to allocated blocks.
if num_required_blocks <= len(req_blocks):
assert not self.enable_caching or token_id_idx == num_tokens, \
f"{token_id_idx=} != {num_tokens=}"
return []
# Allocate new blocks considering preallocated blocks, and
# add token IDs to them if caching is enabled.
num_new_blocks = min(num_new_blocks + self.num_preallocate_blocks,
self.free_block_queue.num_free_blocks)
new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids,
parent_block)
req_blocks.extend(new_blocks)
return new_blocks
def allocate_slots(
self,
request: Request,
num_tokens: int,
computed_blocks: List[KVCacheBlock],
) -> Optional[List[KVCacheBlock]]:
"""Allocate slots for a new request.
Args:
request: The request to allocate slots.
num_tokens: The number of tokens to allocate. Note that this does
not include the tokens that have already been computed.
computed_blocks: The blocks that have already been computed.
Returns:
A list of new allocated blocks.
"""
if num_tokens == 0:
raise ValueError(
f"num_tokens must be greater than 0, got {num_tokens}")
# If a computed block of a request is an eviction candidate (in the
# free queue and ref_cnt == 0), it cannot be counted as a free block
# when allocating this request.
num_evictable_computed_blocks = len(
[blk for blk in computed_blocks if blk.ref_cnt == 0])
num_required_blocks = cdiv(num_tokens, self.block_size)
if (num_required_blocks > self.free_block_queue.num_free_blocks -
num_evictable_computed_blocks):
# Cannot allocate new blocks.
return None
# Determine the number of new blocks to allocate considering
# preallocated blocks.
num_new_blocks = min(
num_required_blocks + self.num_preallocate_blocks,
self.free_block_queue.num_free_blocks -
num_evictable_computed_blocks)
num_computed_tokens = len(computed_blocks) * self.block_size
# When caching is enabled, get the new token IDs and the parent block
# ID to generate cache keys.
new_token_ids = None
parent_block = None
if self.enable_caching:
# Touch the computed blocks to make sure they won't be evicted.
self._touch(computed_blocks)
# Get the token IDs for the blocks being allocated for hashing.
new_token_ids = request.all_token_ids[
num_computed_tokens:num_computed_tokens + num_tokens]
if not new_token_ids:
raise RuntimeError(
"Failed to infer the token IDs for allocation. "
f"#all_tokens={len(request.all_token_ids)} < "
f"#computed_tokens={num_computed_tokens}")
# Get the parent block ID to construct the block chain.
parent_block = computed_blocks[-1] if computed_blocks else None
new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids,
parent_block)
# Concatenate the computed block IDs and the new block IDs.
self.req_to_blocks[request.request_id] = computed_blocks + new_blocks
return new_blocks
def free(self, request: Request) -> None:
"""Free the blocks allocated for the request.
When caching is enabled, we free the blocks in reverse order so that
the tail blocks are evicted first.
Args:
request: The request to free the blocks.
"""
# Default to [] in case a request is freed (aborted) before alloc.
blocks = self.req_to_blocks.pop(request.request_id, [])
if self.enable_caching:
# Free blocks in reverse order so that the tail blocks are
# freed first.
blocks = reversed(blocks)
for block in blocks:
block.ref_cnt -= 1
if block.ref_cnt == 0:
self.free_block_queue.append(block)
def _get_new_blocks(
self,
num_blocks: int,
token_ids: Optional[List[int]] = None,
parent_block: Optional[KVCacheBlock] = None) -> List[KVCacheBlock]:
"""Get new blocks from the free block pool, and add token IDs to
allocated blocks if caching is enabled.
Note that we do not check block cache in this function.
Args:
num_blocks: The number of blocks to allocate.
token_ids: The token IDs in the blocks. None if caching is disabled.
parent_block: The parent block. Used to include block chain
in the block hash.
Returns:
A list of new blocks.
"""
if num_blocks > self.free_block_queue.num_free_blocks:
raise ValueError(
f"Cannot get {num_blocks} free blocks from the pool")
# First allocate blocks.
ret: List[KVCacheBlock] = []
idx = 0
while idx < num_blocks:
curr_block = self.free_block_queue.popleft()
assert curr_block.ref_cnt == 0
# Evict blocks from the cache.
if self.enable_caching:
block_hash = curr_block.block_hash
if (block_hash is not None
and block_hash in self.cached_block_hash_to_block):
if len(self.cached_block_hash_to_block[block_hash]) == 1:
del self.cached_block_hash_to_block[block_hash]
else:
del self.cached_block_hash_to_block[block_hash][
curr_block.block_id]
curr_block.reset()
curr_block.ref_cnt = 1
ret.append(curr_block)
idx += 1
# Then assign token IDs to the allocated blocks.
if self.enable_caching:
assert token_ids is not None
token_id_idx = self._add_token_ids_to_blocks(
blocks=ret, token_ids=token_ids, parent_block=parent_block)
assert token_id_idx == len(token_ids)
return ret
def _cache_full_block(self,
block: KVCacheBlock,
parent_block: Optional[KVCacheBlock] = None) -> None:
"""Cache a full block for prefix caching.
Args:
block: The block to cache.
parent_block: The parent block. None if this is the first block.
"""
parent_block_hash = (parent_block.block_hash
if parent_block is not None else None)
assert len(block.token_ids) == self.block_size
block.token_ids = tuple(block.token_ids)
block_hash = hash_block_tokens(parent_block_hash, block.token_ids)
block.block_hash = block_hash
block.num_hashed_tokens = self.block_size + (
parent_block.num_hashed_tokens if parent_block is not None else 0)
self.cached_block_hash_to_block[block_hash][block.block_id] = block
def _get_cached_block(self,
block_hash: BlockHashType) -> Optional[KVCacheBlock]:
"""Get a cached block by the block hash, or None if cache miss.
If there are duplicated blocks, we return the first block in the cache.
Args:
block_hash: The hash value of the block.
Returns:
The cached block if it exists, or None.
"""
if block_hash in self.cached_block_hash_to_block:
first_block_id = list(
self.cached_block_hash_to_block[block_hash].keys())[0]
return self.cached_block_hash_to_block[block_hash][first_block_id]
return None
def _touch(self, blocks: List[KVCacheBlock]) -> None:
"""Touch a block increases its reference count by 1, and may remove
the block from the free queue. This is used when a block is hit by
another request with the same prefix.
Args:
blocks: A list of blocks to touch.
"""
for block in blocks:
# ref_cnt=0 means this block is in the free list (i.e. eviction
# candidate), so remove it.
if block.ref_cnt == 0:
self.free_block_queue.remove(block)
block.ref_cnt += 1
def _add_token_ids_to_blocks(
self,
blocks: List[KVCacheBlock],
token_ids: List[int],
parent_block: Optional[KVCacheBlock] = None) -> int:
"""Add token IDs to a list of allocated blocks.
If a block becomes full after adding token IDs, cache it.
Return the index of the first token ID that has not been added to the
blocks when the given blocks cannot hold all the token IDs.
Args:
blocks: A list of blocks to add token IDs.
token_ids: A list of token IDs to add.
parent_block: The parent block. None if this is the
first block.
Returns:
The starting token ID index that has not been added to the blocks
due to insufficient given blocks.
"""
token_id_start = 0
for curr_block in blocks:
# If all token IDs are added, then the rest of the blocks are
# preallocated blocks, so we only need to update the
# parent_block_id. FIXME
if token_id_start == len(token_ids):
continue
# Add token IDs to the empty slots in the block.
empty_slots = self.block_size - len(curr_block.token_ids)
token_id_end = min(token_id_start + empty_slots, len(token_ids))
curr_block.token_ids.extend(token_ids[token_id_start:token_id_end])
# Cache the block if it becomes full.
if len(curr_block.token_ids) == self.block_size:
self._cache_full_block(curr_block, parent_block)
parent_block = curr_block
token_id_start = token_id_end
return token_id_start
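A minimal allocate/append/free sketch for KVCacheManager, assuming vLLM is importable; _StubRequest is a hypothetical stand-in carrying only the fields the manager reads (request_id, all_token_ids, num_computed_tokens):

from vllm.v1.core.kv_cache_manager import KVCacheManager

class _StubRequest:
    # Hypothetical stand-in with only the attributes KVCacheManager reads.
    def __init__(self, request_id, all_token_ids):
        self.request_id = request_id
        self.all_token_ids = all_token_ids
        self.num_computed_tokens = 0

manager = KVCacheManager(block_size=16, num_gpu_blocks=64, enable_caching=True)
req = _StubRequest("req-0", list(range(48)))

computed = manager.get_computed_blocks(req)        # [] on a cold cache
new_blocks = manager.allocate_slots(req, 48, computed)
req.num_computed_tokens = 48

# One decode step: the sampled token is appended, then one more slot is used.
req.all_token_ids.append(12345)
assert manager.append_slots(req, 1) == []          # preallocated blocks suffice
manager.free(req)                                  # blocks become eviction candidates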

View File

@@ -0,0 +1,194 @@
"""KV-Cache Utilities."""
from dataclasses import dataclass, field
from typing import List, Optional, Tuple, Union
from vllm.logger import init_logger
logger = init_logger(__name__)
BlockHashType = Tuple[int, Tuple[int]]
@dataclass
class KVCacheBlock:
"""KV-cache block metadata."""
# Block ID, ranging from 0 to num_gpu_blocks - 1.
block_id: int
# Reference count.
ref_cnt: int = 0
# Token IDs in the block. When the block is full, the type of token_ids
# should be Tuple[int] for fast matching.
token_ids: Union[List[int], Tuple[int]] = field(default_factory=list)
# The hash of the block composed of (block hash, tuple of token IDs).
# It is only available when the block is full.
block_hash: Optional[BlockHashType] = None
# The number of hashed tokens. More hashed tokens means the block
# is closer to the end of a prompt and more likely to be evicted.
num_hashed_tokens: int = 0
# Used to construct a doubly linked list for free blocks.
# These two attributes should only be manipulated by FreeKVCacheBlockQueue.
prev_free_block: Optional["KVCacheBlock"] = None
next_free_block: Optional["KVCacheBlock"] = None
def reset(self):
"""Reset the block metadata."""
self.ref_cnt = 0
self.token_ids = []
self.block_hash = None
self.num_hashed_tokens = 0
class FreeKVCacheBlockQueue:
"""This class organizes a list of KVCacheBlock objects to a doubly linked
list of free blocks. We implement this class instead of using Python
builtin deque to support removing a block in the middle of the queue
in O(1) time. To close the performance gap to the builtin deque which is
implemented in C++, this class does not allocate any Python objects when
manipulating the linked list. Instead, this class manipulates the
prev_free_block and next_free_block attributes of the given blocks.
The queue is initially ordered by block ID. When a block is allocated
and then freed, it is appended back according to the eviction order:
1. The least recently used block is at the front (LRU).
2. If two blocks have the same last accessed time (allocated by the
same sequence), the one with more hashed tokens (the tail of a block
chain) is at the front.
Note that we maintain this order by reversing the block order when freeing
the blocks of a request. This operation is outside of this class.
Args:
blocks: A list of KVCacheBlock objects.
"""
def __init__(self, blocks: List[KVCacheBlock]) -> None:
self.num_free_blocks = len(blocks)
# Initialize the doubly linked list of free blocks.
self.free_list_head = blocks[0]
self.free_list_tail = blocks[-1]
for i in range(self.num_free_blocks):
if i > 0:
blocks[i].prev_free_block = blocks[i - 1]
if i < self.num_free_blocks - 1:
blocks[i].next_free_block = blocks[i + 1]
def popleft(self) -> KVCacheBlock:
"""Pop the first free block and reduce num_free_blocks by 1.
Returns:
The first free block.
"""
if not self.free_list_head:
raise ValueError("No free blocks available")
block = self.free_list_head
self.remove(block)
return block
def remove(self, block: KVCacheBlock) -> None:
"""Remove a block in the free list and reduce num_free_blocks by 1.
Args:
block: The block to remove.
"""
if block.prev_free_block is not None:
# Link the previous block to the next block.
block.prev_free_block.next_free_block = block.next_free_block
if block.next_free_block is not None:
# Link the next block to the previous block.
block.next_free_block.prev_free_block = block.prev_free_block
if block == self.free_list_head:
# Update the head if the block is the head.
self.free_list_head = block.next_free_block
if block == self.free_list_tail:
# Update the tail if the block is the tail.
self.free_list_tail = block.prev_free_block
# Remove the block from the linked list.
block.prev_free_block = block.next_free_block = None
self.num_free_blocks -= 1
def append(self, block: KVCacheBlock) -> None:
"""Put a block back into the free list and increase
num_free_blocks by 1.
Args:
block: The block to append.
"""
if self.free_list_tail is not None:
# Link the last block to the new block.
self.free_list_tail.next_free_block = block
block.prev_free_block = self.free_list_tail
self.free_list_tail = block
else:
# The free list is empty.
assert self.free_list_head is None
self.free_list_head = self.free_list_tail = block
block.next_free_block = None
self.num_free_blocks += 1
def get_all_free_blocks(self) -> List[KVCacheBlock]:
"""Get all free blocks in the free list. Mainly used for testing.
Returns:
A list of free blocks.
"""
ret = []
curr_block = self.free_list_head
while curr_block is not None:
ret.append(curr_block)
curr_block = curr_block.next_free_block
return ret
def hash_block_tokens(parent_block_hash: Optional[BlockHashType],
curr_block_token_ids: Tuple[int]) -> BlockHashType:
"""Computes a hash value corresponding to the contents of a block and
the contents of the preceding block(s). The hash value is used for
prefix caching. An LRU cache can be applied to this function to avoid
recomputing hash values for the same block contents.
TODO: Support arbitrary metadata so that we can support more
features such as LoRA adapters.
Args:
parent_block_hash: The hash of the parent block. None
if this is the first block.
curr_block_token_ids: A tuple of token ids in the current
block. The current block is assumed to be full.
Returns:
The hash value of the block and the token ids in the block.
The entire tuple is used as the hash key of the block.
"""
return (hash(
(parent_block_hash, *curr_block_token_ids)), curr_block_token_ids)
def hash_request_tokens(block_size: int,
token_ids: List[int]) -> List[BlockHashType]:
"""Computes hash values of a chain of blocks given a sequence of
token IDs. The hash value is used for prefix caching.
Args:
block_size: The size of each block.
token_ids: A sequence of token ids in the request.
Returns:
The list of computed hash values.
"""
ret = []
parent_block_hash = None
for start in range(0, len(token_ids), block_size):
end = start + block_size
block_token_ids = tuple(token_ids[start:end])
# Do not hash the block if it is not full.
if len(block_token_ids) < block_size:
break
block_hash = hash_block_tokens(parent_block_hash, block_token_ids)
ret.append(block_hash)
parent_block_hash = block_hash
return ret
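A small self-contained check of the chained hashing above, assuming vLLM is importable. Note that, as in hash_request_tokens, the parent passed to hash_block_tokens is the previous block's full (hash, token_ids) tuple:

from vllm.v1.core.kv_cache_utils import hash_block_tokens, hash_request_tokens

hashes = hash_request_tokens(block_size=4, token_ids=list(range(10)))
assert len(hashes) == 2        # the trailing partial block [8, 9] is skipped
# The second hash is chained on the first one.
assert hashes[1] == hash_block_tokens(hashes[0], (4, 5, 6, 7))
# The same tokens under a different parent produce a different cache key.
assert hashes[1] != hash_block_tokens(None, (4, 5, 6, 7))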

View File

@@ -0,0 +1,591 @@
from collections import deque
from dataclasses import dataclass
from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
Tuple, Union)
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
from vllm.v1.core.kv_cache_manager import KVCacheManager
from vllm.v1.engine import EngineCoreOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
if TYPE_CHECKING:
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.base import PlaceholderRange
logger = init_logger(__name__)
class Scheduler:
def __init__(
self,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig,
lora_config: Optional[LoRAConfig],
) -> None:
self.scheduler_config = scheduler_config
self.cache_config = cache_config
self.lora_config = lora_config
# TODO: Support LoRA.
assert lora_config is None, "V1 does not support LoRA yet."
num_gpu_blocks = cache_config.num_gpu_blocks
assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
# Create the block space manager.
self.kv_cache_manager = KVCacheManager(
block_size=self.cache_config.block_size,
num_gpu_blocks=num_gpu_blocks,
sliding_window=self.cache_config.sliding_window,
enable_caching=self.cache_config.enable_prefix_caching)
self.block_size = self.cache_config.block_size
# Scheduling constraints.
self.max_num_running_reqs = self.scheduler_config.max_num_seqs
self.max_num_scheduled_tokens = \
self.scheduler_config.max_num_batched_tokens
self.max_model_len = self.scheduler_config.max_model_len
# req_id -> Request
self.requests: Dict[str, Request] = {}
# Priority queues for requests.
self.waiting: Deque[Request] = deque()
self.running: List[Request] = []
# The request IDs that are finished in between the previous and the
# current steps. This is used to notify the workers about the finished
# requests so that they can free the cached states for those requests.
# This is flushed at the end of each scheduling step.
self.finished_req_ids: Set[str] = set()
# OPTIMIZATION: Cache the RunningRequestData objects to avoid creating
# them at each scheduling step.
# Request id -> RunningRequestData
self.running_reqs_data: Dict[str, RunningRequestData] = {}
# Encoder-related.
# NOTE(woosuk): Here, "encoder" includes the vision encoder (and
# projector if needed). Currently, we assume that the encoder also
# has the Transformer architecture (e.g., ViT).
# FIXME(woosuk): Below are placeholder values. We need to calculate the
# actual values from the configurations.
self.max_num_encoder_input_tokens = 2048
# NOTE(woosuk): For models without an encoder (e.g., text-only models),
# the encoder cache will not be initialized or used, regardless of
# the cache size. This is because the memory space for the encoder cache
# is preallocated in the profiling run.
self.encoder_cache_manager = EncoderCacheManager(cache_size=2048)
def schedule(self) -> "SchedulerOutput":
# NOTE(woosuk) on the scheduling algorithm:
# There's no "decoding phase" nor "prefill phase" in the scheduler.
# Each request just has num_computed_tokens and num_tokens, where
# num_tokens is equal to len(prompt_token_ids) + len(output_token_ids).
# At each step, the scheduler tries to assign tokens to the requests
# so that each request's num_computed_tokens can catch up to its
# num_tokens. This is general enough to cover chunked prefills,
# prefix caching, and the "jump decoding" optimization in the future.
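# For example (hypothetical numbers): a request with a 1000-token prompt
# under max_num_batched_tokens=256 is scheduled 256/256/256/232 tokens over
# four consecutive steps (chunked prefill), and from the fifth step on it is
# scheduled 1 token per step (decode), all through the same code path below.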
scheduled_new_reqs: List[Request] = []
scheduled_resumed_reqs: List[Request] = []
scheduled_running_reqs: List[Request] = []
preempted_reqs: List[Request] = []
req_to_new_block_ids: Dict[str, List[int]] = {}
num_scheduled_tokens: Dict[str, int] = {}
token_budget = self.max_num_scheduled_tokens
# Encoder-related.
scheduled_encoder_inputs: Dict[str, List[int]] = {}
encoder_budget = self.max_num_encoder_input_tokens
# First, schedule the RUNNING requests.
# NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to be
# in the "partial" state, where the request has some tokens computed
# but not all. The constraint is due to the persistent batch in the
# V1 model runner.
# TODO(woosuk): Remove this constraint after refactoring model runner.
has_partial_request = False
req_index = 0
while req_index < len(self.running):
# Only the last request in the RUNNING queue can be "partial".
assert not has_partial_request
assert token_budget > 0
request = self.running[req_index]
num_new_tokens = request.num_tokens - request.num_computed_tokens
num_new_tokens = min(num_new_tokens, token_budget)
assert num_new_tokens > 0
# Schedule encoder inputs.
encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = (
self._try_schedule_encoder_inputs(request,
request.num_computed_tokens,
num_new_tokens,
encoder_budget))
assert num_new_tokens > 0
while True:
new_blocks = self.kv_cache_manager.append_slots(
request, num_new_tokens)
if new_blocks is None:
# The request cannot be scheduled.
# Preempt the lowest-priority request.
preempted_req = self.running.pop()
self.kv_cache_manager.free(preempted_req)
preempted_req.status = RequestStatus.PREEMPTED
preempted_req.num_computed_tokens = 0
self.waiting.appendleft(preempted_req)
preempted_reqs.append(preempted_req)
if preempted_req == request:
# No more requests to preempt.
can_schedule = False
break
else:
# The request can be scheduled.
can_schedule = True
break
if not can_schedule:
break
# Schedule the request.
scheduled_running_reqs.append(request)
req_to_new_block_ids[request.request_id] = [
b.block_id for b in new_blocks
]
num_scheduled_tokens[request.request_id] = num_new_tokens
token_budget -= num_new_tokens
req_index += 1
has_partial_request = (request.num_computed_tokens + num_new_tokens
< request.num_tokens)
# Encoder-related.
if encoder_inputs_to_schedule:
scheduled_encoder_inputs[request.request_id] = (
encoder_inputs_to_schedule)
# Allocate the encoder cache.
for i in encoder_inputs_to_schedule:
self.encoder_cache_manager.allocate(request, i)
encoder_budget = new_encoder_budget
# Next, schedule the WAITING requests.
if not preempted_reqs:
while self.waiting:
if has_partial_request:
break
if len(self.running) == self.max_num_running_reqs:
break
if token_budget == 0:
break
request = self.waiting[0]
# Get already-cached tokens.
computed_blocks = self.kv_cache_manager.get_computed_blocks(
request)
# NOTE(woosuk): Since incomplete blocks are not eligible for
# sharing, `num_computed_tokens` is always a multiple of
# `block_size`.
num_computed_tokens = len(computed_blocks) * self.block_size
# Number of tokens to be scheduled.
# We use `request.num_tokens` instead of
# `request.num_prompt_tokens` to consider the resumed requests,
# which have output tokens.
num_new_tokens = request.num_tokens - num_computed_tokens
if num_new_tokens == 0:
# This happens when the prompt length is divisible by the block
# size and all of its blocks are cached. In this case, we force the
# scheduler to recompute the last token.
num_computed_tokens -= 1
num_new_tokens = 1
computed_blocks.pop()
num_new_tokens = min(num_new_tokens, token_budget)
assert num_new_tokens > 0
# Schedule encoder inputs.
(encoder_inputs_to_schedule, num_new_tokens,
new_encoder_budget) = self._try_schedule_encoder_inputs(
request, num_computed_tokens, num_new_tokens,
encoder_budget)
if num_new_tokens == 0:
# The request cannot be scheduled.
break
new_blocks = self.kv_cache_manager.allocate_slots(
request, num_new_tokens, computed_blocks)
if new_blocks is None:
# The request cannot be scheduled.
break
self.waiting.popleft()
self.running.append(request)
if request.status == RequestStatus.WAITING:
scheduled_new_reqs.append(request)
elif request.status == RequestStatus.PREEMPTED:
scheduled_resumed_reqs.append(request)
else:
raise RuntimeError(
f"Invalid request status: {request.status}")
req_to_new_block_ids[request.request_id] = [
b.block_id for b in computed_blocks + new_blocks
]
num_scheduled_tokens[request.request_id] = num_new_tokens
token_budget -= num_new_tokens
request.status = RequestStatus.RUNNING
request.num_computed_tokens = num_computed_tokens
has_partial_request = (num_computed_tokens + num_new_tokens <
request.num_tokens)
# Encoder-related.
if encoder_inputs_to_schedule:
scheduled_encoder_inputs[request.request_id] = (
encoder_inputs_to_schedule)
# Allocate the encoder cache.
for i in encoder_inputs_to_schedule:
self.encoder_cache_manager.allocate(request, i)
encoder_budget = new_encoder_budget
# Check if the scheduling constraints are satisfied.
total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
assert token_budget >= 0
assert len(self.running) <= self.max_num_running_reqs
assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) +
len(scheduled_running_reqs) == len(self.running))
# Construct the scheduler output.
new_reqs_data = [
NewRequestData.from_request(req,
req_to_new_block_ids[req.request_id],
req.num_computed_tokens)
for req in scheduled_new_reqs
]
resumed_reqs_data = [
ResumedRequestData.from_request(
req, req_to_new_block_ids[req.request_id],
req.num_computed_tokens) for req in scheduled_resumed_reqs
]
running_reqs_data = [
self._make_running_request_data(
req, req_to_new_block_ids[req.request_id],
req.num_computed_tokens) for req in scheduled_running_reqs
]
preempted_req_ids = {req.request_id for req in preempted_reqs}
scheduler_output = SchedulerOutput(
scheduled_new_reqs=new_reqs_data,
scheduled_resumed_reqs=resumed_reqs_data,
scheduled_running_reqs=running_reqs_data,
num_scheduled_tokens=num_scheduled_tokens,
total_num_scheduled_tokens=total_num_scheduled_tokens,
scheduled_encoder_inputs=scheduled_encoder_inputs,
preempted_req_ids=preempted_req_ids,
# finished_req_ids is an existing state in the scheduler,
# instead of being newly scheduled in this step.
# It contains the request IDs that are finished in between
# the previous and the current steps.
finished_req_ids=self.finished_req_ids,
free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
)
self.finished_req_ids = set()
return scheduler_output
def _make_running_request_data(
self,
request: Request,
new_block_ids: List[int],
num_computed_tokens: int,
) -> "RunningRequestData":
# OPTIMIZATION: Cache the RunningRequestData objects to avoid creating
# them at each scheduling step.
if request.request_id in self.running_reqs_data:
req_data = self.running_reqs_data[request.request_id]
req_data.new_block_ids = new_block_ids
req_data.num_computed_tokens = num_computed_tokens
else:
req_data = RunningRequestData.from_request(request, new_block_ids,
num_computed_tokens)
self.running_reqs_data[request.request_id] = req_data
return req_data
def _try_schedule_encoder_inputs(
self,
request: Request,
num_computed_tokens: int,
num_new_tokens: int,
encoder_budget: int,
) -> Tuple[List[int], int, int]:
"""
Determine which encoder inputs need to be scheduled in the current step,
and update `num_new_tokens` and encoder token budget accordingly.
An encoder input will be scheduled if:
- Its output tokens overlap with the range of tokens being computed
in this step, i.e.,
[num_computed_tokens, num_computed_tokens + num_new_tokens).
- It is not already computed and stored in the encoder cache.
- There is sufficient encoder token budget to process it.
- The encoder cache has space to store it.
If an encoder input cannot be scheduled due to cache or budget
limitations, the method adjusts `num_new_tokens` to schedule only the
decoder tokens up to just before the unschedulable encoder input.
"""
if not request.has_encoder_inputs():
return [], num_new_tokens, encoder_budget
encoder_inputs_to_schedule: List[int] = []
mm_positions = request.mm_positions
assert mm_positions is not None
assert len(mm_positions) > 0
for i, pos_info in enumerate(mm_positions):
start_pos = pos_info["offset"]
num_encoder_tokens = pos_info["length"]
# The encoder output is needed if the two ranges overlap:
# [num_computed_tokens, num_computed_tokens + num_new_tokens) and
# [start_pos, start_pos + num_encoder_tokens)
if start_pos >= num_computed_tokens + num_new_tokens:
# The encoder input is not needed in this step.
break
if start_pos + num_encoder_tokens <= num_computed_tokens:
# The encoder input is already computed and stored
# in the decoder's KV cache.
continue
if self.encoder_cache_manager.has_cache(request, i):
# The encoder input is already computed and cached.
continue
if not self.encoder_cache_manager.can_allocate(request, i):
# The encoder cache is full. We can only schedule the decoder
# tokens just before the encoder input.
num_new_tokens = start_pos - num_computed_tokens
break
if num_encoder_tokens > encoder_budget:
# The encoder budget is exhausted. We can only schedule the
# decoder tokens up until the encoder input.
# NOTE(woosuk): We assume that the encoder tokens should be
# processed altogether, as the encoder usually uses
# bidirectional attention.
num_new_tokens = start_pos - num_computed_tokens
break
encoder_budget -= num_encoder_tokens
encoder_inputs_to_schedule.append(i)
return encoder_inputs_to_schedule, num_new_tokens, encoder_budget
def update_from_output(
self,
scheduler_output: "SchedulerOutput",
model_runner_output: "ModelRunnerOutput",
) -> List[EngineCoreOutput]:
# NOTE(woosuk): This method doesn't consider speculative decoding.
sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist()
num_scheduled_tokens = scheduler_output.num_scheduled_tokens
new_running: List[Request] = []
engine_core_outputs: List[EngineCoreOutput] = []
for request in self.running:
req_id = request.request_id
request.num_computed_tokens += num_scheduled_tokens[req_id]
# When the request's num_computed_tokens catches up to its num_tokens,
# the request generates output tokens. Otherwise, we ignore the
# sampler output for the request.
assert request.num_computed_tokens <= request.num_tokens
cached_encoder_input_ids = (
self.encoder_cache_manager.get_cached_input_ids(request))
for input_id in list(cached_encoder_input_ids):
start_pos = request.mm_positions[input_id]["offset"]
num_tokens = request.mm_positions[input_id]["length"]
if start_pos + num_tokens <= request.num_computed_tokens:
# The encoder output is already processed and stored
# in the decoder's KV cache.
self.encoder_cache_manager.free(request, input_id)
if request.num_computed_tokens == request.num_tokens:
req_index = model_runner_output.req_id_to_index[req_id]
# NOTE(woosuk): Currently, we assume that each request
# generates at most one token at each step.
token_id = sampled_token_ids[req_index]
request.append_output_token_ids(token_id)
num_new_tokens = 1
# TODO: Update the KV cache manager for prefix caching.
# Check for stop and update request state.
# This must be called before we make the EngineCoreOutput.
stopped = self._check_stop(request)
# Add EngineCoreOutput for this Request.
output = EngineCoreOutput(
request_id=req_id,
new_token_ids=request.output_token_ids[-num_new_tokens:],
finished=request.is_finished(),
finish_reason=request.get_finished_reason(),
stop_reason=request.stop_reason)
engine_core_outputs.append(output)
# The request is finished; do not add it back to the running queue.
if stopped:
continue
new_running.append(request)
self.running = new_running
return engine_core_outputs
def _check_stop(self, request: Request) -> bool:
if (request.num_tokens >= self.max_model_len
or request.num_output_tokens >= request.max_tokens):
request.status = RequestStatus.FINISHED_LENGTH_CAPPED
self._free_request(request)
return True
sampling_params = request.sampling_params
last_token_id = request.output_token_ids[-1]
if (not sampling_params.ignore_eos
and last_token_id == request.eos_token_id):
request.status = RequestStatus.FINISHED_STOPPED
self._free_request(request)
return True
if last_token_id in (sampling_params.stop_token_ids or ()):
request.status = RequestStatus.FINISHED_STOPPED
request.stop_reason = last_token_id
self._free_request(request)
return True
return False
def add_request(self, request: Request) -> None:
self.waiting.append(request)
self.requests[request.request_id] = request
def finish_requests(
self,
request_ids: Union[str, Iterable[str]],
finished_status: RequestStatus,
) -> None:
"""Handles the finish signal from outside the scheduler.
For example, the API server can abort a request when the client
disconnects.
"""
assert RequestStatus.is_finished(finished_status)
if isinstance(request_ids, str):
request_ids = (request_ids, )
request_ids = set(request_ids)
for req_id in request_ids:
request = self.requests.get(req_id)
if request is None:
# Invalid request ID.
continue
if request.status == RequestStatus.RUNNING:
self.running.remove(request)
else:
self.waiting.remove(request)
request.status = finished_status
self._free_request(request)
def _free_request(self, request: Request) -> None:
assert request.is_finished()
self.kv_cache_manager.free(request)
self.running_reqs_data.pop(request.request_id, None)
del self.requests[request.request_id]
self.finished_req_ids.add(request.request_id)
def get_num_unfinished_requests(self) -> int:
return len(self.waiting) + len(self.running)
def has_unfinished_requests(self) -> bool:
return self.get_num_unfinished_requests() > 0
@dataclass
class NewRequestData:
req_id: str
prompt_token_ids: List[int]
prompt: Optional[str]
mm_inputs: List["MultiModalKwargs"]
mm_positions: List["PlaceholderRange"]
sampling_params: SamplingParams
block_ids: List[int]
num_computed_tokens: int
@classmethod
def from_request(
cls,
request: Request,
block_ids: List[int],
num_computed_tokens: int,
) -> "NewRequestData":
return cls(
req_id=request.request_id,
prompt_token_ids=request.prompt_token_ids,
prompt=request.prompt,
mm_inputs=request.mm_inputs,
mm_positions=request.mm_positions,
sampling_params=request.sampling_params,
block_ids=block_ids,
num_computed_tokens=num_computed_tokens,
)
@dataclass
class ResumedRequestData:
req_id: str
block_ids: List[int]
num_computed_tokens: int
@classmethod
def from_request(
cls,
request: Request,
block_ids: List[int],
num_computed_tokens: int,
) -> "ResumedRequestData":
return cls(
req_id=request.request_id,
block_ids=block_ids,
num_computed_tokens=num_computed_tokens,
)
@dataclass
class RunningRequestData:
req_id: str
new_block_ids: List[int]
num_computed_tokens: int
@classmethod
def from_request(
cls,
request: Request,
new_block_ids: List[int],
num_computed_tokens: int,
) -> "RunningRequestData":
return cls(
req_id=request.request_id,
new_block_ids=new_block_ids,
num_computed_tokens=num_computed_tokens,
)
@dataclass
class SchedulerOutput:
scheduled_new_reqs: List[NewRequestData]
scheduled_resumed_reqs: List[ResumedRequestData]
scheduled_running_reqs: List[RunningRequestData]
num_scheduled_tokens: Dict[str, int]
total_num_scheduled_tokens: int
scheduled_encoder_inputs: Dict[str, List[int]]
preempted_req_ids: Set[str]
finished_req_ids: Set[str]
free_encoder_input_ids: List[Tuple[str, int]]
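To make the catch-up invariant described in schedule() concrete, here is a tiny self-contained toy model of the token-budget policy (pure Python, no vLLM imports; all names are illustrative, not part of the API). Chunked prefill and decode fall out of the same rule: each step, every request receives up to num_tokens - num_computed_tokens tokens within a shared budget.

def toy_schedule(requests, max_num_batched_tokens):
    """One scheduling step: returns {req_id: num_scheduled_tokens}."""
    budget = max_num_batched_tokens
    scheduled = {}
    for req in requests:
        if budget == 0:
            break
        num_new = min(req["num_tokens"] - req["num_computed_tokens"], budget)
        if num_new <= 0:
            continue
        scheduled[req["req_id"]] = num_new
        req["num_computed_tokens"] += num_new
        budget -= num_new
    return scheduled

# A 1000-token prompt is prefilled in chunks of 256, then decodes 1 token/step.
req = {"req_id": "req-0", "num_tokens": 1000, "num_computed_tokens": 0}
steps = []
while req["num_computed_tokens"] < req["num_tokens"]:
    steps.append(toy_schedule([req], max_num_batched_tokens=256))
assert [s["req-0"] for s in steps] == [256, 256, 256, 232]
# After sampling one token, num_tokens grows by 1 and the next step is decode.
req["num_tokens"] += 1
assert toy_schedule([req], max_num_batched_tokens=256) == {"req-0": 1}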