Iluvatar-mrv100 SDK 4.3.0

2025-09-15 14:58:11 +08:00
parent 9efe891f99
commit 8af8290b1d
1052 changed files with 294967 additions and 1 deletions
--- a/vllm/v1/core/init.py
+++ b/vllm/v1/core/init.py
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -0,0 +1,281 @@
+# SPDX-License-Identifier: Apache-2.0
+from collections import defaultdict
+from collections.abc import Iterable
+from typing import Callable, Optional
+
+from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
+                                         KVCacheBlock,
+                                         generate_block_hash_extra_keys,
+                                         hash_block_tokens)
+from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+class BlockPool:
+    """BlockPool that manages KVCacheBlocks.
+    It provides methods to allocate, free and cache the kv cache blocks. The
+    free_block_queue stores the free blocks in eviction order to enable
+    allocation, free, and cache eviction. The cached_block_hash_to_block
+    maps between block hash and cached block to support finding cached blocks
+    by their block hash.
+
+    Args:
+        num_gpu_blocks: The number of blocks in the pool.
+        enable_caching: Whether to enable prefix caching.
+    """
+
+    def __init__(self, num_gpu_blocks: int, enable_caching: bool):
+        assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
+        self.num_gpu_blocks = num_gpu_blocks
+        self.enable_caching = enable_caching
+        # All kv-cache blocks.
+        self.blocks: list[KVCacheBlock] = [
+            KVCacheBlock(idx) for idx in range(num_gpu_blocks)
+        ]
+        # Free block queue that constructs and manipulates a doubly linked
+        # list of free blocks (including eviction candidates when caching is
+        # enabled).
+        self.free_block_queue = FreeKVCacheBlockQueue(self.blocks)
+
+        # {block_hash: {block ID: block}}. A cached block is
+        # a full block with a block hash that can be used for prefix caching.
+        # The cached block may be used by running requests or in the
+        # free_block_queue that could potentially be evicted.
+        # NOTE: We currently don't de-duplicate the blocks in the cache,
+        # meaning that if a block becomes full and is cached, we don't check
+        # if there is already an identical block in the cache. This is because
+        # we want to make sure the allocated block IDs won't change so that
+        # block tables are append-only.
+        self.cached_block_hash_to_block: dict[BlockHashType, dict[
+            int, KVCacheBlock]] = defaultdict(dict)
+
+        # To represent a placeholder block with block_id=0.
+        # The ref_cnt of null_block is not maintained, needs special care to
+        # avoid freeing it.
+        self.null_block = self.free_block_queue.popleft()
+
+    def get_cached_block(self,
+                         block_hash: BlockHashType) -> Optional[KVCacheBlock]:
+        """Get a cached block by the block hash, or None if cache miss.
+        If there are duplicated blocks, we return the first block in the cache.
+
+        Args:
+            block_hash: The hash value of the block.
+
+        Returns:
+            The cached block if it exists, or None.
+        """
+        if block_hash in self.cached_block_hash_to_block:
+            first_block_id = list(
+                self.cached_block_hash_to_block[block_hash].keys())[0]
+            return self.cached_block_hash_to_block[block_hash][first_block_id]
+        return None
+
+    def cache_full_blocks(
+        self,
+        request: Request,
+        blocks: list[KVCacheBlock],
+        block_hashes: list[BlockHashType],
+        num_cached_blocks: int,
+        num_full_blocks: int,
+        block_size: int,
+        hash_fn: Callable,
+    ) -> None:
+        """Cache a list of full blocks for prefix caching.
+        This function takes a list of blocks that will have their block hash
+        metadata to be updated and cached. Given a request, it computes the
+        block hashes for the blocks starting from `num_cached_blocks` to
+        `num_full_blocks`, updating the metadata for each block
+        and caching them in the `cached_block_hash_to_block`.
+
+        Args:
+            request: The request to cache the blocks.
+            blocks: All blocks in the request.
+            block_hashes: Block hashes of the blocks in the request. Note that
+            this list may be shorter than the blocks list. In this case the
+            missed block hash will be computed in this function.
+            num_cached_blocks: The number of blocks that are already cached.
+            num_full_blocks: The number of blocks that are full and should
+                be cached after this function.
+            block_size: Number of tokens in each block.
+            hash_fn: The hash function to use for block hashes.
+        """
+        if num_cached_blocks == num_full_blocks:
+            return
+        new_full_blocks = blocks[num_cached_blocks:num_full_blocks]
+        assert len(block_hashes) >= num_cached_blocks
+        new_block_hashes = block_hashes[num_cached_blocks:]
+
+        # Update the new blocks with the block hashes through the chain.
+        if num_cached_blocks == 0:
+            prev_block_hash_value = None
+        else:
+            prev_block = blocks[num_cached_blocks - 1]
+            assert prev_block.block_hash is not None
+            prev_block_hash_value = prev_block.block_hash.hash_value
+
+        for i, blk in enumerate(new_full_blocks):
+            assert blk.block_hash is None
+
+            if i < len(new_block_hashes):
+                # The block hash may already be computed in
+                # "get_computed_blocks" if the tokens are not generated by
+                # this request (either the prompt tokens or the previously
+                # generated tokens with preemption). In this case we simply
+                # reuse the block hash.
+                block_hash = new_block_hashes[i]
+            else:
+                # Otherwise compute the block hash and cache it in the request
+                # in case it will be preempted in the future.
+                blk_idx = num_cached_blocks + i
+                start_token_idx = blk_idx * block_size
+                end_token_idx = (blk_idx + 1) * block_size
+                block_tokens = request.all_token_ids[
+                    start_token_idx:end_token_idx]
+                assert len(block_tokens) == block_size, (
+                    f"Expected {block_size} tokens, got "
+                    f"{len(block_tokens)} at {blk_idx}th block for request "
+                    f"{request.request_id}({request})")
+
+                # Generate extra keys for multi-modal inputs. Note that since
+                # we reach to this branch only when the block is completed with
+                # generated tokens, we only need to consider the last mm input.
+                extra_keys, _ = generate_block_hash_extra_keys(
+                    request, start_token_idx, end_token_idx, -1)
+
+                # Compute the hash of the current block.
+                block_hash = hash_block_tokens(hash_fn, prev_block_hash_value,
+                                               block_tokens, extra_keys)
+                block_hashes.append(block_hash)
+
+            # Update and added the full block to the cache.
+            blk.block_hash = block_hash
+            self.cached_block_hash_to_block[block_hash][blk.block_id] = blk
+            prev_block_hash_value = block_hash.hash_value
+
+    def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]:
+        """Get new blocks from the free block pool.
+
+        Note that we do not check block cache in this function.
+
+        Args:
+            num_blocks: The number of blocks to allocate.
+
+        Returns:
+            A list of new block.
+        """
+        if num_blocks > self.get_num_free_blocks():
+            raise ValueError(
+                f"Cannot get {num_blocks} free blocks from the pool")
+
+        ret: list[KVCacheBlock] = []
+        idx = 0
+        while idx < num_blocks:
+            # First allocate blocks.
+            curr_block = self.free_block_queue.popleft()
+            assert curr_block.ref_cnt == 0
+
+            # If the block is cached, evict it.
+            if self.enable_caching:
+                self._maybe_evict_cached_block(curr_block)
+
+            curr_block.incr_ref()
+            ret.append(curr_block)
+            idx += 1
+
+        return ret
+
+    def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool:
+        """
+        If a block is cached in `cached_block_hash_to_block`, we reset its hash
+        metadata and evict it from the cache.
+
+        Args:
+            block: The block to evict.
+
+        Returns:
+            True if the block is evicted, False otherwise.
+        """
+        block_hash = block.block_hash
+        if block_hash and block_hash in self.cached_block_hash_to_block:
+            block.reset_hash()
+            del self.cached_block_hash_to_block[block_hash][block.block_id]
+
+            if len(self.cached_block_hash_to_block[block_hash]) == 0:
+                del self.cached_block_hash_to_block[block_hash]
+
+            return True
+        return False
+
+    def touch(self, blocks: list[KVCacheBlock]) -> None:
+        """Touch a block increases its reference count by 1, and may remove
+        the block from the free queue. This is used when a block is hit by
+        another request with the same prefix.
+
+        Args:
+            blocks: A list of blocks to touch.
+        """
+        for block in blocks:
+            # ref_cnt=0 means this block is in the free list (i.e. eviction
+            # candidate), so remove it.
+            if block.ref_cnt == 0 and block != self.null_block:
+                self.free_block_queue.remove(block)
+            block.incr_ref()
+
+    def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None:
+        """Free a list of blocks. The blocks should be ordered by their
+        eviction priority, where the first block will be evicted first.
+
+        Args:
+            ordered_blocks: A list of blocks to free ordered by their eviction
+                priority.
+        """
+        for block in ordered_blocks:
+            block.decr_ref()
+            # null_block should not be added to the free list.
+            if block.ref_cnt == 0 and block != self.null_block:
+                self.free_block_queue.append(block)
+
+    def reset_prefix_cache(self) -> bool:
+        """Reset prefix cache. This function may be used in RLHF
+        flows to invalid prefix caching after the weights are updated,
+        or used for resetting prefix caching status for benchmarking.
+
+        Returns:
+            bool: True if the prefix cache is successfully reset,
+            False otherwise.
+        """
+        num_used_blocks = (self.num_gpu_blocks - self.get_num_free_blocks())
+        if num_used_blocks != 1:  # The null block is always marked as used
+            logger.warning(
+                "Failed to reset prefix cache because some "
+                "blocks (%d) are not freed yet", num_used_blocks - 1)
+            return False
+
+        # Remove all hashes so that no new blocks will hit.
+        self.cached_block_hash_to_block = defaultdict(dict)
+
+        # Remove all hashes from all blocks.
+        for block in self.blocks:
+            block.reset_hash()
+
+        logger.info("Successfully reset prefix cache")
+        return True
+
+    def get_num_free_blocks(self) -> int:
+        """Get the number of free blocks in the pool.
+
+        Returns:
+            The number of free blocks.
+        """
+        return self.free_block_queue.num_free_blocks
+
+    def get_usage(self) -> float:
+        """Get the KV cache usage.
+
+        Returns:
+            The KV cache usage (between 0.0 and 1.0).
+        """
+        return 1.0 - (self.get_num_free_blocks() / self.num_gpu_blocks)
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import TYPE_CHECKING
+
+from vllm.logger import init_logger
+from vllm.multimodal import MultiModalRegistry
+from vllm.v1.request import Request
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, SchedulerConfig
+
+logger = init_logger(__name__)
+
+
+class EncoderCacheManager:
+
+    def __init__(self, cache_size: int):
+        self.cache_size = cache_size
+        self.num_free_slots = cache_size
+        # req_id -> cached input ids
+        self.cached: dict[str, set[int]] = {}
+        # list of [req_id, input_id]
+        self.freed: list[tuple[str, int]] = []
+
+    def has_cache(self, request: Request, input_id: int) -> bool:
+        req_id = request.request_id
+        return req_id in self.cached and input_id in self.cached[req_id]
+
+    def can_allocate(self, request: Request, input_id: int) -> bool:
+        num_tokens = request.get_num_encoder_tokens(input_id)
+        return num_tokens <= self.num_free_slots
+
+    def allocate(self, request: Request, input_id: int) -> None:
+        req_id = request.request_id
+        if req_id not in self.cached:
+            self.cached[req_id] = set()
+        self.cached[req_id].add(input_id)
+        self.num_free_slots -= request.get_num_encoder_tokens(input_id)
+
+    def get_cached_input_ids(self, request: Request) -> set[int]:
+        return self.cached.get(request.request_id, set())
+
+    def free_encoder_input(self, request: Request, input_id: int) -> None:
+        """Free a single encoder input id for the request."""
+        req_id = request.request_id
+        if req_id not in self.cached:
+            return
+
+        self.cached[req_id].discard(input_id)
+        if len(self.cached[req_id]) == 0:
+            del self.cached[req_id]
+        self.num_free_slots += request.get_num_encoder_tokens(input_id)
+        self.freed.append((req_id, input_id))
+
+    def free(self, request: Request) -> None:
+        """Free all cached input ids for the request."""
+        input_ids = self.get_cached_input_ids(request).copy()
+        for input_id in input_ids:
+            self.free_encoder_input(request, input_id)
+
+    def get_freed_ids(self) -> list[tuple[str, int]]:
+        freed = self.freed
+        self.freed = []
+        return freed
+
+
+def compute_encoder_budget(
+    model_config: "ModelConfig",
+    scheduler_config: "SchedulerConfig",
+    mm_registry: MultiModalRegistry,
+) -> tuple[int, int]:
+    """Compute the encoder cache budget based on the model and scheduler 
+    configurations.
+
+    Args:
+        model_config: Model configuration.
+        scheduler_config: Scheduler configuration.
+        mm_registry: Provides information about the token cost.
+
+    Returns:
+        - Compute budget for encoder execution, in unit of number of tokens 
+            in the input sequence.
+        - Space budget for encoder cache size, in unit of number of tokens 
+            in the input sequence.
+    """
+
+    if not model_config.is_multimodal_model:
+        return 0, 0
+
+    # TODO: handle encoder-decoder models once we support them.
+    (
+        encoder_compute_budget,
+        encoder_cache_size,
+    ) = _compute_encoder_budget_multimodal(
+        model_config,
+        scheduler_config,
+        mm_registry,
+    )
+
+    return encoder_compute_budget, encoder_cache_size
+
+
+def _compute_encoder_budget_multimodal(
+    model_config: "ModelConfig",
+    scheduler_config: "SchedulerConfig",
+    mm_registry: MultiModalRegistry,
+) -> tuple[int, int]:
+    """Compute the encoder cache budget based on the model and scheduler 
+    configurations for a multimodal model.
+
+    Args:
+        model_config: Model configuration.
+        scheduler_config: Scheduler configuration.
+        mm_registry: Provides information about the token cost.
+
+    Returns:
+        - Compute budget for encoder execution, in unit of number of tokens 
+            in the input sequence.
+        - Space budget for encoder cache size, in unit of number of tokens 
+            in the input sequence.
+    """
+
+    max_tokens_by_modality_dict = mm_registry \
+        .get_max_tokens_per_item_by_nonzero_modality(model_config)
+
+    if not max_tokens_by_modality_dict:
+        logger.warning(
+            "All non-text modalities supported by the model have been "
+            "explicitly disabled via limit_mm_per_prompt. Encoder cache will "
+            "not be initialized.")
+        return 0, 0
+
+    _, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
+                                    key=lambda item: item[1])
+
+    encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens,
+                                 max_tokens_per_mm_item)
+    encoder_cache_size = max(scheduler_config.encoder_cache_size,
+                             max_tokens_per_mm_item)
+
+    return encoder_compute_budget, encoder_cache_size
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -0,0 +1,376 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from collections import defaultdict
+from collections.abc import Iterable
+from typing import Optional
+
+from vllm.logger import init_logger
+from vllm.utils import cdiv, sha256
+from vllm.v1.core.block_pool import BlockPool
+from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
+                                         hash_request_tokens)
+from vllm.v1.core.specialized_manager import get_specialized_manager
+from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.metrics.stats import PrefixCacheStats
+from vllm.v1.request import Request, RequestStatus
+
+logger = init_logger(__name__)
+
+
+class KVCacheManager:
+
+    def __init__(
+        self,
+        kv_cache_config: KVCacheConfig,
+        max_model_len: int,
+        enable_caching: bool = True,
+        caching_hash_algo: str = "builtin",
+        num_preallocate_tokens: int = 64,
+        log_stats: bool = False,
+    ) -> None:
+        assert len(kv_cache_config.kv_cache_groups) == 1, (
+            "KVCacheManager does not support hybrid models with more than 1 "
+            "kv cache group")
+        kv_cache_spec = kv_cache_config.kv_cache_groups[0].kv_cache_spec
+        self.block_size = kv_cache_spec.block_size
+        self.num_gpu_blocks = kv_cache_config.num_blocks
+        self.max_model_len = max_model_len
+        self.max_num_blocks_per_req = cdiv(max_model_len, self.block_size)
+
+        self.enable_caching = enable_caching
+        self.caching_hash_fn = sha256 if caching_hash_algo == "sha256" else hash
+        # FIXME: make prefix cache stats conditional on log_stats
+        self.log_stats = log_stats
+        # NOTE(woosuk): To avoid frequent block allocation, we preallocate some
+        # blocks for each request. For example, when a request reaches the end
+        # of its block table, we preallocate N blocks in advance. This way, we
+        # reduce the overhead of updating free_block_ids and ref_cnts for each
+        # request every step (at the cost of some memory waste).
+        # NOTE(woosuk): This is different from the "lookahead" slots since this
+        # does not guarantee that the request always has N empty blocks. After
+        # the request gets N empty blocks, it starts to use the blocks without
+        # further allocation. When it uses up all the N empty blocks, it gets
+        # N new empty blocks.
+        self.num_preallocate_tokens = num_preallocate_tokens
+        self.num_preallocate_blocks = cdiv(num_preallocate_tokens,
+                                           self.block_size)
+
+        self.block_pool = BlockPool(self.num_gpu_blocks, enable_caching)
+
+        self.specialized_manager = get_specialized_manager(
+            kv_cache_spec=kv_cache_spec,
+            block_pool=self.block_pool,
+        )
+
+        # Mapping from request ID to blocks to track the blocks allocated
+        # for each request, so that we can free the blocks when the request
+        # is finished.
+        self.req_to_blocks: defaultdict[str,
+                                        list[KVCacheBlock]] = defaultdict(list)
+
+        # Mapping from request ID to kv block hashes.
+        # This is to avoid recomputing the block hashes for each call of
+        # `get_computed_blocks` or `allocate_slots`.
+        self.req_to_block_hashes: defaultdict[
+            str, list[BlockHashType]] = defaultdict(list)
+
+        # {req_id: The number of cached blocks for this given request}
+        # This is used to track the number of cached blocks for each request.
+        # This is only used to track the RUNNING requests, we do not track the
+        # data for reempted ones.
+        self.num_cached_block: dict[str, int] = {}
+        self.prefix_cache_stats = PrefixCacheStats()
+
+    @property
+    def usage(self) -> float:
+        """Get the KV cache usage.
+
+        Returns:
+            The KV cache usage (between 0.0 and 1.0).
+        """
+        return self.block_pool.get_usage()
+
+    def make_prefix_cache_stats(self) -> PrefixCacheStats:
+        """Get (and reset) the prefix cache stats.
+
+        Returns:
+            The current prefix caching stats.
+        """
+        stats = self.prefix_cache_stats
+        self.prefix_cache_stats = PrefixCacheStats()
+        return stats
+
+    def get_computed_blocks(
+            self, request: Request) -> tuple[list[KVCacheBlock], int]:
+        """Get the computed (cached) blocks for the request.
+        Note that the computed blocks must be full.
+
+        Args:
+            request: The request to get the computed blocks.
+
+        Returns:
+            A tuple containing:
+                - A list of blocks that are computed for the request.
+                - The number of computed tokens.
+        """
+        if not self.enable_caching:
+            # Prefix caching is disabled.
+            return [], 0
+
+        # The block hashes for the request may already be computed
+        # if the scheduler has tried to schedule the request before.
+        block_hashes = self.req_to_block_hashes[request.request_id]
+        if not block_hashes:
+            block_hashes = hash_request_tokens(self.caching_hash_fn,
+                                               self.block_size, request)
+            self.req_to_block_hashes[request.request_id] = block_hashes
+
+        self.prefix_cache_stats.requests += 1
+        if request.sampling_params.prompt_logprobs is None:
+            if len(block_hashes) * self.block_size == request.num_tokens:
+                # When prompt length is divisible by the block size and all
+                # blocks are cached, we need to recompute the last token. This
+                # have to be achieved by re-computing an entire block because
+                # allocate_slots() assumes num_computed_tokens is always a
+                # multiple of the block size. To achieve this, remove the last
+                # block hash from the block_hashes for find_longest_cache_hit
+                # This limitation can potentially be removed in the future to
+                # slightly improve the performance.
+                last_block_hash = block_hashes.pop()
+            else:
+                last_block_hash = None
+
+            computed_blocks = (
+                self.specialized_manager.find_longest_cache_hit(block_hashes))
+
+            if last_block_hash is not None:
+                # Add back the last block hash if it was removed.
+                block_hashes.append(last_block_hash)
+
+            self.prefix_cache_stats.queries += len(block_hashes)
+            self.prefix_cache_stats.hits += len(computed_blocks)
+
+            # NOTE(woosuk): Since incomplete blocks are not eligible for
+            # sharing, `num_computed_tokens` is always a multiple of
+            # `block_size`.
+            num_computed_tokens = len(computed_blocks) * self.block_size
+            return computed_blocks, num_computed_tokens
+        else:
+            # Skip cache hits for prompt logprobs
+            return [], 0
+
+    def allocate_slots(
+        self,
+        request: Request,
+        num_tokens: int,
+        new_computed_blocks: Optional[list[KVCacheBlock]] = None
+    ) -> Optional[list[KVCacheBlock]]:
+        """Add slots for a request with new tokens to append.
+
+        Args:
+            request: The request to allocate slots.
+            num_tokens: The number of tokens to allocate. Note that this does
+                not include the tokens that have already been computed.
+            new_computed_blocks: A list of new computed blocks just hitting the
+                prefix caching.
+
+        Blocks layout:
+        -----------------------------------------------------------------------
+        | < computed > | < new computed > |    < new >    | < pre-allocated > |
+        -----------------------------------------------------------------------
+        |                  < required >                   |
+        --------------------------------------------------
+        |                    < full >                  |
+        ------------------------------------------------
+                                          | <new full> |
+                                          --------------
+        The following *_blocks are illustrated in this layout.
+
+        Returns:
+            A list of new allocated blocks.
+        """
+        if num_tokens == 0:
+            raise ValueError("num_tokens must be greater than 0")
+
+        new_computed_blocks = new_computed_blocks or []
+
+        req_blocks = self.req_to_blocks[request.request_id]
+
+        # Free the blocks that are skipped during the attention computation
+        # (e.g., tokens outside the sliding window).
+        # We can do this even if we cannot schedule this request due to
+        # insufficient free blocks.
+        # Should call this function before allocating new blocks to reduce
+        # the number of evicted blocks.
+        removed_blocks = self.specialized_manager.remove_skipped_blocks(
+            req_blocks, request.num_computed_tokens)
+        self.block_pool.free_blocks(removed_blocks)
+
+        # The number of computed tokens is the number of computed tokens plus
+        # the new prefix caching hits
+        num_computed_tokens = (request.num_computed_tokens +
+                               len(new_computed_blocks) * self.block_size)
+        num_required_blocks = cdiv(num_computed_tokens + num_tokens,
+                                   self.block_size)
+        num_new_blocks = (num_required_blocks - len(req_blocks) -
+                          len(new_computed_blocks))
+
+        # If a computed block of a request is an eviction candidate (in the
+        # free queue and ref_cnt == 0), it cannot be counted as a free block
+        # when allocating this request.
+        num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks
+                                            if blk.ref_cnt == 0)
+        if (num_new_blocks > self.block_pool.get_num_free_blocks() -
+                num_evictable_computed_blocks):
+            # Cannot allocate new blocks
+            return None
+
+        # Touch the computed blocks to make sure they won't be evicted.
+        if self.enable_caching:
+            self.block_pool.touch(new_computed_blocks)
+        else:
+            assert not new_computed_blocks, (
+                "Computed blocks should be empty when "
+                "prefix caching is disabled")
+
+        # Append the new computed blocks to the request blocks until now to
+        # avoid the case where the new blocks cannot be allocated.
+        req_blocks.extend(new_computed_blocks)
+
+        # Start to handle new blocks
+
+        if num_new_blocks <= 0:
+            # No new block is needed.
+            new_blocks = []
+        else:
+            # Get new blocks from the free block pool considering
+            # preallocated blocks.
+            num_new_blocks = min(
+                num_new_blocks + self.num_preallocate_blocks,
+                self.block_pool.get_num_free_blocks(),
+                # Should not exceed the maximum number of blocks per request.
+                # This is especially because the block table has the shape
+                # [..., max_num_blocks_per_req].
+                self.max_num_blocks_per_req - len(req_blocks),
+            )
+            assert num_new_blocks > 0
+
+            # Concatenate the computed block IDs and the new block IDs.
+            new_blocks = self.block_pool.get_new_blocks(num_new_blocks)
+            req_blocks.extend(new_blocks)
+
+        if not self.enable_caching:
+            return new_blocks
+
+        # Use `new_computed_blocks` for a new request, and `num_cached_block`
+        # for a running request.
+        num_cached_blocks = self.num_cached_block.get(request.request_id,
+                                                      len(new_computed_blocks))
+        # Speculated tokens might be rejected in the future, so we does
+        # not cache any speculated tokens. We only cache blocks with
+        # generated (accepted) tokens.
+        num_full_blocks_after_append = (num_computed_tokens + num_tokens - len(
+            request.spec_token_ids)) // self.block_size
+
+        self.block_pool.cache_full_blocks(
+            request=request,
+            blocks=req_blocks,
+            block_hashes=self.req_to_block_hashes[request.request_id],
+            num_cached_blocks=num_cached_blocks,
+            num_full_blocks=num_full_blocks_after_append,
+            block_size=self.block_size,
+            hash_fn=self.caching_hash_fn,
+        )
+
+        self.num_cached_block[
+            request.request_id] = num_full_blocks_after_append
+        return new_blocks
+
+    def free(self, request: Request) -> None:
+        """Free the blocks allocated for the request.
+        When caching is enabled, we free the blocks in reverse order so that
+        the tail blocks are evicted first.
+
+        Args:
+            request: The request to free the blocks.
+        """
+        # Default to [] in case a request is freed (aborted) before alloc.
+        blocks = self.req_to_blocks.pop(request.request_id, [])
+        ordered_blocks: Iterable[KVCacheBlock] = blocks
+        if self.enable_caching:
+            # Free blocks in reverse order so that the tail blocks are
+            # freed first.
+            ordered_blocks = reversed(blocks)
+
+        self.block_pool.free_blocks(ordered_blocks)
+        self.num_cached_block.pop(request.request_id, None)
+
+    def reset_prefix_cache(self) -> bool:
+        """Reset prefix cache. This function may be used in RLHF
+        flows to invalid prefix caching after the weights are updated,
+        or used for resetting prefix caching status for benchmarking.
+
+        Returns:
+            bool: True if the prefix cache is successfully reset,
+            False otherwise.
+        """
+        if self.block_pool.reset_prefix_cache():
+            self.prefix_cache_stats.reset = True
+            return True
+        return False
+
+    def get_num_common_prefix_blocks(
+        self,
+        request: Request,
+        num_running_requests: int,
+    ) -> int:
+        """Calculate the number of common prefix blocks shared by all requests
+        in the RUNNING state.
+
+        The function determines this by selecting any request and iterating
+        through its blocks.  A block is considered a common prefix block if its
+        `ref_cnt` equals the total number of requests in the RUNNING state.
+
+        NOTE(woosuk): The number of requests in the RUNNING state is **greater
+        than or equal to** the number of requests scheduled in the current step.
+        This is because the RUNNING state only indicates that:
+        1. The request has not yet finished, and
+        2. The request holds its blocks unfreed.
+
+        While all scheduled requests must be in the RUNNING state, the inverse
+        is not necessarily true. There may be RUNNING requests that are not
+        scheduled in the current step.
+
+        This can result in an edge case where the number of common prefix blocks
+        is 0, even though all scheduled requests share a common prefix. This
+        occurs because there may be unscheduled RUNNING requests that do not
+        share the common prefix. Currently, this case cannot be easily detected,
+        so the function returns 0 in such cases.
+
+        Args:
+            request: Any request in the RUNNING state, used to identify the
+                common prefix blocks.
+            num_running_requests: The total number of requests in the RUNNING
+                state. This can be different from the number of scheduled
+                requests in the current step.
+
+        Returns:
+            int: The number of common prefix blocks.
+        """
+        assert request.status == RequestStatus.RUNNING
+        blocks = self.req_to_blocks[request.request_id]
+        num_common_blocks = 0
+        for block in blocks:
+            if block.ref_cnt == num_running_requests:
+                num_common_blocks += 1
+            else:
+                break
+        return num_common_blocks
+
+    def free_block_hashes(self, request: Request) -> None:
+        """Discard the block hashes for the request.
+
+        NOTE: Unlike `free`, this method should be called only when the request
+        is finished, not when it is preempted.
+        """
+        self.req_to_block_hashes.pop(request.request_id, None)
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -0,0 +1,690 @@
+# SPDX-License-Identifier: Apache-2.0
+"""KV-Cache Utilities."""
+import os
+from collections import deque
+from collections.abc import Sequence
+from dataclasses import dataclass
+from typing import Any, Callable, NamedTuple, Optional
+
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.utils import sha256
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
+                                        KVCacheGroupSpec, KVCacheSpec,
+                                        KVCacheTensor, SlidingWindowSpec)
+from vllm.v1.metrics.stats import PrefixCacheStats
+from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+class BlockHashType(NamedTuple):
+    """Hash value of a block (int), the token IDs in the block, and extra keys.
+    We keep a tuple of token IDs and extra keys to reduce the likelihood of
+    hash collisions when the hash value is the same. By using SHA256 however,
+    hash collisions are practically impossible.
+    """
+    # Hash value of the block in an integer.
+    hash_value: int
+    # Token IDs in the block.
+    token_ids: tuple[int, ...]
+    # Extra keys for the block.
+    extra_keys: Optional[Any] = None
+
+
+# The hash seed for the first block of the prefix block sequence.
+#
+# Even if the hash function is the builtin hash(), we use sha256 to generate
+# the initial hash to simplify the code. This is not performance critical
+# as it is done one per process.
+#
+# We use a random value to avoid hash collisions or PYTHONHASHSEED environment
+# variable if set such that processes can share the seed if needed.
+# This aligns with the behavior of Python's hash() function, which also uses
+# a random seed if PYTHONHASHSEED is not set.
+NONE_HASH = int.from_bytes(os.urandom(32), byteorder="big") if os.getenv(
+    'PYTHONHASHSEED') is not None else sha256(os.getenv('PYTHONHASHSEED'))
+
+
+class PrefixCachingMetrics:
+    """Metrics for prefix caching with a hit rate of the most recent N requests.
+
+    Args:
+        interval: The number of the most recent requests to aggregate.
+            Defaults to 1000.
+    """
+
+    def __init__(self, interval: int = 1000):
+        self.interval = interval
+        # The current aggregated values.
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+        # A deque of (requests, queries, hits) for the most recent requests.
+        self.query_queue: deque[tuple[int, int, int]] = deque()
+
+    def observe(self, stats: PrefixCacheStats):
+        """Observe the prefix caching for a set of requests.
+
+        This function is called with information gathered when new requests
+        are being scheduled and are looking for computed blocks.
+
+        When there are more than `interval` requests, the oldest set of
+        requestsare removed from the metrics.
+
+        Args:
+            stats: The prefix cache stats.
+        """
+        # reset_prefix_cache was invoked before the current update.
+        # Reset the metrics before aggregating the current stats.
+        if stats.reset:
+            self.reset()
+
+        # Update the metrics.
+        self.query_queue.append((stats.requests, stats.queries, stats.hits))
+        self.aggregated_requests += stats.requests
+        self.aggregated_query_total += stats.queries
+        self.aggregated_query_hit += stats.hits
+
+        # Remove the oldest stats if the number of requests exceeds.
+        if self.aggregated_requests > self.interval:
+            old_requests, old_queries, old_hits = self.query_queue.popleft()
+            self.aggregated_requests -= old_requests
+            self.aggregated_query_total -= old_queries
+            self.aggregated_query_hit -= old_hits
+
+    def reset(self):
+        """Reset the metrics."""
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+        self.query_queue.clear()
+
+    @property
+    def hit_rate(self) -> float:
+        """Calculate the hit rate for the past N requests."""
+        if self.aggregated_query_total == 0:
+            return 0.0
+        return self.aggregated_query_hit / self.aggregated_query_total
+
+
+@dataclass
+class KVCacheBlock:
+    """KV-cache block metadata."""
+    # Block ID, ranging from 0 to num_gpu_blocks - 1.
+    block_id: int
+    # Reference count.
+    ref_cnt: int = 0
+    # The hash of the block composed of (block hash, tuple of token IDs).
+    # It is only available when the block is full.
+    _block_hash: Optional[BlockHashType] = None
+
+    # Used to construct a doubly linked list for free blocks.
+    # These two attributes should only be manipulated by FreeKVCacheBlockQueue.
+    prev_free_block: Optional["KVCacheBlock"] = None
+    next_free_block: Optional["KVCacheBlock"] = None
+
+    def incr_ref(self):
+        self.ref_cnt += 1
+
+    def decr_ref(self):
+        self.ref_cnt -= 1
+
+    @property
+    def block_hash(self) -> Optional[BlockHashType]:
+        return self._block_hash
+
+    @block_hash.setter
+    def block_hash(self, block_hash: BlockHashType):
+        assert self.block_hash is None, (
+            "The block already has a hash. This should not happen.")
+        self._block_hash = block_hash
+
+    def reset_hash(self):
+        """Reset the block hash when the block is evicted."""
+        self._block_hash = None
+
+    def __repr__(self) -> str:
+        # Use block_id instead of KVCacheBlock object to avoid calling __repr__
+        # on KVCacheBlock object recursively.
+        prev_block_id = self.prev_free_block.block_id \
+            if self.prev_free_block else None
+        next_block_id = self.next_free_block.block_id \
+            if self.next_free_block else None
+        return (f"KVCacheBlock(block_id={self.block_id}, "
+                f"ref_cnt={self.ref_cnt}, "
+                f"_block_hash={self._block_hash}, "
+                f"prev_free_block={prev_block_id}, "
+                f"next_free_block={next_block_id})")
+
+
+class FreeKVCacheBlockQueue:
+    """This class organizes a list of KVCacheBlock objects to a doubly linked
+    list of free blocks. We implement this class instead of using Python
+    builtin deque to support removing a block in the middle of the queue
+    in O(1) time. To close the performance gap to the builtin deque which is
+    implemented in C++, this class does not allocate any Python objects when
+    manipulating the linked list. Instead, this class manipulates the
+    prev_free_block and next_free_block attributes of the given blocks.
+
+    The queue is ordered by block ID in the beginning. When a block is allocated
+    and then freed, it will be appended back with the eviction order:
+    1. The least recent used block is at the front (LRU).
+    2. If two blocks have the same last accessed time (allocated by the
+       same sequence), the one with more hash tokens (the tail of a block
+       chain) is at the front.
+    Note that we maintain this order by reversing the block order when free
+    blocks of a request. This operation is outside of this class.
+
+    Args:
+        blocks: A list of KVCacheBlock objects.
+    """
+
+    def __init__(self, blocks: list[KVCacheBlock]) -> None:
+        self.num_free_blocks = len(blocks)
+
+        # Initialize the doubly linked list of free blocks.
+        self.free_list_head: Optional[KVCacheBlock] = blocks[0]
+        self.free_list_tail: Optional[KVCacheBlock] = blocks[-1]
+        for i in range(self.num_free_blocks):
+            if i > 0:
+                blocks[i].prev_free_block = blocks[i - 1]
+            if i < self.num_free_blocks - 1:
+                blocks[i].next_free_block = blocks[i + 1]
+
+    def popleft(self) -> KVCacheBlock:
+        """Pop the first free block and reduce num_free_blocks by 1.
+
+        Returns:
+            The first free block.
+        """
+        if not self.free_list_head:
+            raise ValueError("No free blocks available")
+
+        block = self.free_list_head
+        self.remove(block)
+        return block
+
+    def remove(self, block: KVCacheBlock) -> None:
+        """Remove a block in the free list and reduce num_free_blocks by 1.
+
+        Args:
+            block: The block to remove.
+        """
+        if block.prev_free_block is not None:
+            # Link the previous block to the next block.
+            block.prev_free_block.next_free_block = block.next_free_block
+        if block.next_free_block is not None:
+            # Link the next block to the previous block.
+            block.next_free_block.prev_free_block = block.prev_free_block
+
+        if block == self.free_list_head:
+            # Update the head if the block is the head.
+            self.free_list_head = block.next_free_block
+        if block == self.free_list_tail:
+            # Update the tail if the block is the tail.
+            self.free_list_tail = block.prev_free_block
+
+        # Remove the block from the linked list.
+        block.prev_free_block = block.next_free_block = None
+        self.num_free_blocks -= 1
+
+    def append(self, block: KVCacheBlock) -> None:
+        """Put a block back into the free list and increase
+        num_free_blocks by 1.
+
+        Args:
+            block: The block to append.
+        """
+        if self.free_list_tail is not None:
+            # Link the last block to the new block.
+            self.free_list_tail.next_free_block = block
+            block.prev_free_block = self.free_list_tail
+            self.free_list_tail = block
+        else:
+            # The free list is empty.
+            assert self.free_list_head is None
+            self.free_list_head = self.free_list_tail = block
+
+        block.next_free_block = None
+        self.num_free_blocks += 1
+
+    def get_all_free_blocks(self) -> list[KVCacheBlock]:
+        """Get all free blocks in the free list. Mainly used for testing.
+
+        Returns:
+            A list of free blocks.
+        """
+        ret = []
+        curr_block = self.free_list_head
+        while curr_block is not None:
+            ret.append(curr_block)
+            curr_block = curr_block.next_free_block
+        return ret
+
+
+def need_extra_keys(request: Request) -> bool:
+    """Check whether the blocks allocated to this request need extra hash keys.
+
+    Args:
+        request (Request): The request.
+
+    Returns:
+        bool: Whether blocks allocated to this request need extra hash keys.
+    """
+
+    # Multimodal requests need to include the MM hash.
+    # LoRA requests need to include the LoRA ID.
+    return bool(request.mm_positions) or (request.lora_request is not None)
+
+
+def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int,
+                            end_token_idx: int,
+                            start_mm_idx: int) -> tuple[list[Any], int]:
+    """Generate extra keys related to MultiModal request for block hash
+    computation. For multi-modal inputs, the extra keys are
+    (mm_hash, start_offset) that indicate a mm input contained in the
+    block and its starting offset in the block tokens.
+
+    Args:
+        request: The request object.
+        start_token_idx: The start token index of the block.
+        end_token_idx: The end token index of the block.
+        start_mm_idx: The start multi-modal index of the block.
+
+    Returns:
+        A tuple of extra keys and the next multi-modal index.
+    """
+    extra_keys: list[Any] = []
+
+    mm_positions, mm_hashes = request.mm_positions, request.mm_hashes
+    if not mm_positions:
+        return extra_keys, start_mm_idx
+
+    if mm_positions and len(mm_positions) != len(mm_hashes):
+        raise ValueError(
+            "The number of multi-modal positions and hashes must match. This "
+            "is likely because you do not enable MM preprocessor hashing. "
+            "Please set disable_mm_preprocessor_cache=False.")
+
+    # Note that we assume mm_positions is sorted by offset.
+    # We do not need to check all mm inputs if the start token index is out of
+    # range. This usually happens in the late prefill phase and decoding phase.
+    if mm_positions[-1]["offset"] + mm_positions[-1][
+            "length"] < start_token_idx:
+        return extra_keys, start_mm_idx
+
+    # Support start_mm_idx == -1 to indicate the last mm input.
+    if start_mm_idx < 0:
+        assert -start_mm_idx <= len(mm_positions)
+        start_mm_idx = len(mm_positions) + start_mm_idx
+
+    curr_mm_idx = start_mm_idx
+    while mm_positions and curr_mm_idx < len(mm_positions):
+        assert mm_hashes[curr_mm_idx] is not None
+        offset = mm_positions[curr_mm_idx]["offset"]
+        length = mm_positions[curr_mm_idx]["length"]
+        if end_token_idx > offset:
+            if start_token_idx > offset + length:
+                # This block has passed the current mm input.
+                curr_mm_idx += 1
+                continue
+
+            # The block contains the current mm input.
+            extra_keys.append(mm_hashes[curr_mm_idx])
+
+            if end_token_idx >= offset + length:
+                # If this block contains the end of the current mm input,
+                # move to the next mm input as this block may also contain
+                # the next mm input.
+                curr_mm_idx += 1
+            else:
+                # Otherwise this block is done with mm inputs.
+                break
+        else:
+            # This block has not reached the current mm input.
+            break
+    return extra_keys, curr_mm_idx
+
+
+def _gen_lora_extra_hash_keys(request: Request) -> list[int]:
+    """Generate extra keys related to LoRA for block hash computation.
+
+    Args:
+        request: The request object.
+
+    Returns:
+        Return LoRA id of the request if it is a LoRA request. Return empty
+        list otherwise.
+    """
+    if not request.lora_request:
+        return []
+    return [request.lora_request.lora_int_id]
+
+
+def generate_block_hash_extra_keys(
+        request: Request, start_token_idx: int, end_token_idx: int,
+        start_mm_idx: int) -> tuple[Optional[tuple[Any, ...]], int]:
+    """Generate extra keys for the block hash. The extra keys can come from
+    the multi-modal inputs and request specific metadata (e.g., LoRA ID).
+
+    Args:
+        request: The request object.
+        start_token_idx: The start token index of the block.
+        end_token_idx: The end token index of the block.
+        start_mm_idx: The start multi-modal index of the block.
+
+    Returns:
+        A tuple of extra keys and the next multi-modal index.
+    """
+    mm_extra_keys: list[Any]
+    mm_extra_keys, new_start_mm_idx = _gen_mm_extra_hash_keys(
+        request, start_token_idx, end_token_idx, start_mm_idx)
+    lora_extra_keys: list[int] = _gen_lora_extra_hash_keys(request)
+
+    extra_keys: list[Any] = lora_extra_keys + mm_extra_keys
+
+    if not extra_keys:
+        return None, new_start_mm_idx
+
+    return tuple(extra_keys), new_start_mm_idx
+
+
+def hash_block_tokens(
+        hash_function: Callable,
+        parent_block_hash: Optional[int],
+        curr_block_token_ids: Sequence[int],
+        extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHashType:
+    """Computes a hash value corresponding to the contents of a block and
+    the contents of the preceding block(s). The hash value is used for
+    prefix caching. We use LRU cache for this function to avoid recomputing
+    hash values for the same block contents.
+
+    Args:
+        parent_block_hash: The hash of the parent block. None
+            if this is the first block.
+        curr_block_token_ids: A list of token ids in the current
+            block. The current block is assumed to be full.
+        extra_keys: Extra keys for the block.
+
+    Returns:
+        The hash value of the block and the token ids in the block.
+        The entire tuple is used as the hash key of the block.
+    """
+    if not parent_block_hash:
+        parent_block_hash = NONE_HASH
+
+    curr_block_token_ids_tuple = tuple(curr_block_token_ids)
+    return BlockHashType(
+        hash_function(
+            (parent_block_hash, curr_block_token_ids_tuple, extra_keys)),
+        curr_block_token_ids_tuple, extra_keys)
+
+
+def hash_request_tokens(hash_function: Any, block_size: int,
+                        request: Request) -> list[BlockHashType]:
+    """Computes hash values of a chain of blocks given a sequence of
+    token IDs. The hash value is used for prefix caching.
+
+    Args:
+        block_size: The size of each block.
+        request: The request object.
+
+    Returns:
+        The list of computed hash values.
+    """
+    token_ids = request.all_token_ids
+
+    req_need_extra_keys = need_extra_keys(request)
+    req_extra_keys = None
+    curr_mm_idx = 0
+
+    ret = []
+    parent_block_hash_value = None
+    for start in range(0, len(token_ids), block_size):
+        end = start + block_size
+        block_token_ids = token_ids[start:end]
+        # Do not hash the block if it is not full.
+        if len(block_token_ids) < block_size:
+            break
+
+        if req_need_extra_keys:
+            # MM and LoRA requests need extra keys for block-hash computation.
+            req_extra_keys, curr_mm_idx = generate_block_hash_extra_keys(
+                request, start, end, curr_mm_idx)
+
+        block_hash = hash_block_tokens(hash_function, parent_block_hash_value,
+                                       block_token_ids, req_extra_keys)
+        ret.append(block_hash)
+        parent_block_hash_value = block_hash.hash_value
+    return ret
+
+
+def check_enough_kv_cache_memory(vllm_config: VllmConfig,
+                                 kv_cache_spec: dict[str, KVCacheSpec],
+                                 available_memory: int):
+    """
+    Checks whether `available_memory` is enough for the KV cache to hold at
+    least one request with the model's max_model_len.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of each attention layer in the model
+        available_memory: Memory available for KV cache in bytes.
+
+    Raises:
+        ValueError: If there is not enough memory available for the KV cache.
+    """
+
+    if available_memory <= 0:
+        raise ValueError("No available memory for the cache blocks. "
+                         "Try increasing `gpu_memory_utilization` when "
+                         "initializing the engine.")
+
+    max_model_len = vllm_config.model_config.max_model_len
+    needed_memory = 0
+    for layer_spec in kv_cache_spec.values():
+        needed_memory += layer_spec.max_memory_usage_bytes(vllm_config)
+
+    if needed_memory > available_memory:
+        raise ValueError(
+            f"To serve at least one request with the models's max seq len "
+            f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GiB KV "
+            f"cache is needed, which is larger than the available KV cache "
+            f"memory ({available_memory/1024/1024/1024:.2f} GiB). Try "
+            f"increasing `gpu_memory_utilization` or decreasing "
+            f"`max_model_len` when initializing the engine.")
+
+
+def create_kv_cache_group_specs(
+        kv_cache_spec: dict[str, KVCacheSpec],
+        grouped_layer_names: list[list[str]]) -> list[KVCacheGroupSpec]:
+    """
+     Create KVCacheGroupSpec object for each kv cache group layer.
+     The layers in the same group should share the same
+     KVCacheSpec.
+
+     Args:
+         kv_cache_spec:
+             A mapping from each layer name to its corresponding KVCacheSpec.
+         grouped_layer_names:
+             A list of kv cache groups, where each element is a list of layer
+             names that belong to the same group and should share the same
+             KVCacheSpec.
+     Returns:
+         A list of KVCacheGroupSpec objects, one for each group.
+     """
+    kv_cache_groups = []
+    for layer_names_one_group in grouped_layer_names:
+        layer_spec = kv_cache_spec[layer_names_one_group[0]]
+        assert all(
+            kv_cache_spec[layer_name] == layer_spec
+            for layer_name in layer_names_one_group[1:]), (
+                "All layers in the same KV cache group must share the same "
+                "KVCacheSpec.")
+        kv_cache_groups.append(
+            KVCacheGroupSpec(layer_names_one_group, layer_spec))
+    return kv_cache_groups
+
+
+def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+    """
+    Whether all layers in the given KVCacheSpec have the same type of KV cache.
+
+    Args:
+        kv_cache_spec: The kv cache spec of each attention layer in the model
+
+    Returns:
+        True if all layers have the same type, False otherwise.
+    """
+
+    layer_keys = set(layer.type_id for layer in kv_cache_spec.values())
+    return len(layer_keys) == 1
+
+
+def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
+                                      kv_cache_spec: dict[str, KVCacheSpec],
+                                      available_memory: int) -> KVCacheConfig:
+    """
+    Generates the KV cache configuration for a model with one type of KV cache.
+    Divide the available memory equally among all layers.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of each attention layer in the model
+        available_memory: Memory available for KV cache in bytes.
+
+    Returns:
+        The generated KVCacheConfig
+    """
+
+    page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
+    scale_page_sizes = {layer.scale_page_size_bytes for layer in kv_cache_spec.values()}
+    assert len(page_sizes) == 1
+    page_size = page_sizes.pop()
+    scale_page_size = scale_page_sizes.pop()
+
+    num_blocks = int(available_memory // (page_size + scale_page_size) // len(kv_cache_spec))
+    num_blocks = max(num_blocks, 0)
+
+    if vllm_config.cache_config.num_gpu_blocks_override is not None:
+        num_gpu_blocks_override = \
+            vllm_config.cache_config.num_gpu_blocks_override
+        logger.info(
+            "Overriding num_gpu_blocks=%d with "
+            "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
+        num_blocks = num_gpu_blocks_override
+
+    num_tokens = num_blocks * vllm_config.cache_config.block_size
+    num_tokens_str = f"{num_tokens:,}"
+    logger.info("GPU KV cache size: %s tokens", num_tokens_str)
+    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
+    max_concurrency = num_tokens / vllm_config.model_config.max_model_len
+    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                max_model_len_str, max_concurrency)
+
+    per_layer_size = (page_size + scale_page_size) * num_blocks
+    # All layers have the same KV cache spec, so we create one kv cache group
+    # for all layers.
+    grouped_layer_names = [list(kv_cache_spec.keys())]
+
+    kv_cache_config = KVCacheConfig(
+        num_blocks=num_blocks,
+        tensors={
+            layer_name: KVCacheTensor(size=per_layer_size)
+            for layer_name in kv_cache_spec
+        },
+        kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
+                                                    grouped_layer_names),
+    )
+    return kv_cache_config
+
+
+def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
+    """
+    Only models with one type of KV cache are supported yet. This function tries
+    to convert the KV cache specs to one type if the model is a hybrid model 
+    with multiple type of KV cache. It will convert all SlidingWindowSpec to
+    FullAttentionSpec if both types are present.
+    
+    Args:
+        kv_cache_spec: The kv cache spec of each attention layer in the model
+    """
+
+    has_full_attention = any(
+        isinstance(spec, FullAttentionSpec) for spec in kv_cache_spec.values())
+    has_sliding_window = any(
+        isinstance(spec, SlidingWindowSpec) for spec in kv_cache_spec.values())
+    if has_full_attention and has_sliding_window:
+        for layer_name, spec in kv_cache_spec.items():
+            if isinstance(spec, SlidingWindowSpec):
+                kv_cache_spec[layer_name] = FullAttentionSpec(
+                    block_size=spec.block_size,
+                    num_kv_heads=spec.num_kv_heads,
+                    head_size=spec.head_size,
+                    dtype=spec.dtype,
+                    use_mla=spec.use_mla,
+                )
+
+
+def get_kv_cache_config(vllm_config: VllmConfig,
+                        kv_cache_spec: dict[str, KVCacheSpec],
+                        available_memory: int) -> KVCacheConfig:
+    """
+    Generates the KV cache configuration for a model
+    TODO: support hybrid models with more than one type of KV cache.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of each attention layer in the model
+        available_memory: Memory available for KV cache in bytes.
+
+    Returns:
+        The generated KVCacheConfigs
+    """
+    check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
+    unify_hybrid_kv_cache_specs(kv_cache_spec)
+    if is_kv_cache_type_uniform(kv_cache_spec):
+        # KV cache of all layers are the same, which is true for
+        # most models. Allocate the same amount of memory for
+        # each layer.
+        return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
+                                                 available_memory)
+
+    raise NotImplementedError
+
+
+def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]):
+    """
+    Make the KV cache configurations for each worker consistent, so that all
+    workers can be controlled by the same KVCacheManager.
+    This function verifies that the layer group of each worker are the same,
+    and changes the num_blocks of each worker to the smallest among all workers.
+
+    Args:
+        kv_cache_configs: The KV cache configurations for each worker. Will be
+            in-place modified to make them consistent.
+    """
+
+    # Sort the kv cache groups by the type_id of their KV cache spec.
+    # This can avoid the inconsistency caused by the order of groups.
+    for kv_cache_config in kv_cache_configs:
+        kv_cache_config.kv_cache_groups.sort(
+            key=lambda x: x.kv_cache_spec.type_id)
+
+    # Verify that the groups of each rank are the same.
+    for kv_cache_config in kv_cache_configs[1:]:
+        for group_rank_0, group_rank_i in zip(
+                kv_cache_configs[0].kv_cache_groups,
+                kv_cache_config.kv_cache_groups):
+            assert group_rank_0.kv_cache_spec == group_rank_i.kv_cache_spec
+
+    # Change the num_blocks of each rank to the smallest among all ranks. We
+    # do not need to shrink the tensor size because it is valid to only use the
+    # first `num_blocks` blocks of the tensor.
+    min_num_blocks = min(kv_cache_config.num_blocks
+                         for kv_cache_config in kv_cache_configs)
+    for kv_cache_config in kv_cache_configs:
+        kv_cache_config.num_blocks = min_num_blocks
+
+    return kv_cache_configs
--- a/vllm/v1/core/sched/init.py
+++ b/vllm/v1/core/sched/init.py
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Optional, Union
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.engine import EngineCoreOutputs
+    from vllm.v1.metrics.stats import SchedulerStats
+    from vllm.v1.outputs import ModelRunnerOutput
+    from vllm.v1.request import Request, RequestStatus
+
+
+class SchedulerInterface(ABC):
+
+    @abstractmethod
+    def schedule(self) -> "SchedulerOutput":
+        """Schedule the requests to process in this scheduling step.
+
+        The scheduling decision is made at the iteration level. Each scheduling
+        step corresponds to a single forward pass of the model. Therefore, this
+        method is called repeatedly by a busy loop in the engine.
+
+        Essentially, the scheduler produces a dictionary of {req_id: num_tokens}
+        that specifies how many tokens to process for each request in this
+        scheduling step. For example, num_tokens can be as large as the number
+        of prompt tokens for new requests, or it can be 1 for the requests that
+        are auto-regressively generating new tokens one by one. Otherwise, it
+        can be somewhere in between in case of chunked prefills, prefix caching,
+        speculative decoding, etc.
+
+        Additionally, the scheduler also returns useful data about each request
+        or the batch as a whole. The model runner will use this information in
+        preparing inputs to the model.
+
+        Returns:
+            A SchedulerOutput object containing information about the scheduled
+            requests.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def update_from_output(
+        self,
+        scheduler_output: "SchedulerOutput",
+        model_runner_output: "ModelRunnerOutput",
+    ) -> "EngineCoreOutputs":
+        """Update the scheduler state based on the model runner output.
+
+        This method is called after the model runner has processed the scheduled
+        requests. The model runner output includes generated token ids, draft
+        token ids for next step, etc. The scheduler uses this information to
+        update its states, checks the finished requests, and returns the output
+        for each request.
+
+        Returns:
+            A EngineCoreOutputs object containing the outputs for each request.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_request(self, request: "Request") -> None:
+        """Add a new request to the scheduler's internal queue.
+        
+        Args:
+            request: The new request being added.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def finish_requests(
+        self,
+        request_ids: Union[str, Iterable[str]],
+        finished_status: "RequestStatus",
+    ) -> None:
+        """Finish the requests in the scheduler's internal queue. If the request
+        is not in the queue, this method will do nothing.
+
+        This method is called in two cases:
+        1. When the request is aborted by the client.
+        2. When the frontend process detects a stop string of the request after
+           de-tokenizing its generated tokens.
+           
+        Args:
+            request_ids: A single or a list of request IDs.
+            finished_status: The finished status of the given requests.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_num_unfinished_requests(self) -> int:
+        """Number of unfinished requests in the scheduler's internal queue."""
+        raise NotImplementedError
+
+    def has_unfinished_requests(self) -> bool:
+        """Returns True if there are unfinished requests in the scheduler's
+        internal queue."""
+        return self.get_num_unfinished_requests() > 0
+
+    @abstractmethod
+    def has_finished_requests(self) -> bool:
+        """Returns True if there are finished requests that need to be cleared.
+        NOTE: This is different from `not self.has_unfinished_requests()`.
+
+        The scheduler maintains an internal list of the requests finished in the
+        previous step. This list is returned from the next call to schedule(),
+        to be sent to the model runner in the next step to clear cached states
+        for these finished requests.
+
+        This method checks if this internal list of finished requests is
+        non-empty. This information is useful for DP attention.
+        """
+        raise NotImplementedError
+
+    def has_requests(self) -> bool:
+        """Returns True if there are unfinished requests, or finished requests
+        not yet returned in SchedulerOutputs."""
+        return self.has_unfinished_requests() or self.has_finished_requests()
+
+    @abstractmethod
+    def get_num_unscheduled_requests(self) -> int:
+        """Number of requests that are not being processed by the executor."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def reset_prefix_cache(self) -> bool:
+        """Reset the prefix cache for KV cache.
+
+        This is particularly required when the model weights are live-updated.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def make_stats(self) -> Optional["SchedulerStats"]:
+        """Make a SchedulerStats object for logging.
+
+        The SchedulerStats object is created for every scheduling step.
+        """
+        raise NotImplementedError
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -0,0 +1,123 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    import numpy as np
+    import numpy.typing as npt
+
+    from vllm.lora.request import LoRARequest
+    from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
+    from vllm.sampling_params import SamplingParams
+    from vllm.v1.request import Request
+
+
+@dataclass
+class NewRequestData:
+
+    req_id: str
+    prompt_token_ids: list[int]
+    prompt: Optional[str]
+    mm_inputs: list[MultiModalKwargs]
+    mm_hashes: list[str]
+    mm_positions: list[PlaceholderRange]
+    sampling_params: SamplingParams
+    block_ids: list[int]
+    num_computed_tokens: int
+    lora_request: Optional[LoRARequest]
+
+    @classmethod
+    def from_request(
+        cls,
+        request: Request,
+        block_ids: list[int],
+    ) -> NewRequestData:
+        return cls(
+            req_id=request.request_id,
+            prompt_token_ids=request.prompt_token_ids,
+            prompt=request.prompt,
+            mm_inputs=request.mm_inputs,
+            mm_hashes=request.mm_hashes,
+            mm_positions=request.mm_positions,
+            sampling_params=request.sampling_params,
+            block_ids=block_ids,
+            num_computed_tokens=request.num_computed_tokens,
+            lora_request=request.lora_request,
+        )
+
+
+@dataclass
+class CachedRequestData:
+
+    req_id: str
+    # If resumed_from_preemption is False, new_block_ids will be appended to
+    # the request's block IDs. If True, new_block_ids will be used as the
+    # request's block IDs instead of appending to the existing block IDs.
+    resumed_from_preemption: bool
+    new_token_ids: list[int]
+    new_block_ids: list[int]
+    num_computed_tokens: int
+
+    @classmethod
+    def from_request(
+        cls,
+        request: Request,
+        resumed_from_preemption: bool,
+        new_token_ids: list[int],
+        new_block_ids: list[int],
+    ) -> CachedRequestData:
+        return cls(
+            req_id=request.request_id,
+            resumed_from_preemption=resumed_from_preemption,
+            new_token_ids=new_token_ids,
+            new_block_ids=new_block_ids,
+            num_computed_tokens=request.num_computed_tokens,
+        )
+
+
+@dataclass
+class SchedulerOutput:
+
+    # list of the requests that are scheduled for the first time.
+    # We cache the request's data in each worker process, so that we don't
+    # need to re-send it every scheduling step.
+    scheduled_new_reqs: list[NewRequestData]
+    # list of the requests that have been scheduled before.
+    # Since the request's data is already cached in the worker processes,
+    # we only send the diff to minimize the communication cost.
+    scheduled_cached_reqs: list[CachedRequestData]
+
+    # req_id -> num_scheduled_tokens
+    # Number of tokens scheduled for each request.
+    num_scheduled_tokens: dict[str, int]
+    # Total number of tokens scheduled for all requests.
+    # Equal to sum(num_scheduled_tokens.values())
+    total_num_scheduled_tokens: int
+    # req_id -> spec_token_ids
+    # If a request does not have any spec decode tokens, it will not be
+    # included in the dictionary.
+    scheduled_spec_decode_tokens: dict[str, list[int]]
+    # req_id -> encoder input indices that need processing.
+    # E.g., if a request has [0, 1], it could mean the vision encoder needs
+    # to process that the request's 0-th and 1-th images in the current step.
+    scheduled_encoder_inputs: dict[str, list[int]]
+    # Number of common prefix blocks for all requests.
+    # This can be used for cascade attention.
+    num_common_prefix_blocks: int
+
+    # Request IDs that are finished in between the previous and the current
+    # steps. This is used to notify the workers about the finished requests
+    # so that they can free the cached states for those requests.
+    finished_req_ids: set[str]
+    # list of (req_id, encoder_input_index) tuples.
+    # Used to free the encoder cache.
+    free_encoder_input_ids: list[tuple[str, int]]
+
+    # Dict of request ids to their index within the batch
+    # for filling the next token bitmask
+    structured_output_request_ids: dict[str, int]
+    # the bitmask for the whole batch
+    grammar_bitmask: Optional[npt.NDArray[np.int32]]
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -0,0 +1,759 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import time
+from collections import deque
+from collections.abc import Iterable
+from typing import Optional, Union
+
+from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig
+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
+                                                compute_encoder_budget)
+from vllm.v1.core.kv_cache_manager import KVCacheManager
+from vllm.v1.core.sched.interface import SchedulerInterface
+from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
+                                       SchedulerOutput)
+from vllm.v1.core.sched.utils import check_stop
+from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput,
+                            EngineCoreOutputs)
+from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.request import Request, RequestStatus
+from vllm.v1.spec_decode.metrics import SpecDecodingStats
+from vllm.v1.structured_output import StructuredOutputManager
+
+logger = init_logger(__name__)
+
+
+class Scheduler(SchedulerInterface):
+
+    def __init__(
+        self,
+        scheduler_config: SchedulerConfig,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        lora_config: Optional[LoRAConfig],
+        kv_cache_config: KVCacheConfig,
+        structured_output_manager: StructuredOutputManager,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+        include_finished_set: bool = False,
+        log_stats: bool = False,
+    ) -> None:
+        self.scheduler_config = scheduler_config
+        self.cache_config = cache_config
+        self.lora_config = lora_config
+        self.kv_cache_config = kv_cache_config
+        self.log_stats = log_stats
+        self.structured_output_manager = structured_output_manager
+
+        # include_finished_set controls whether a separate set of finished
+        # request ids should be included in the EngineCoreOutputs returned
+        # by update_from_outputs(). This is currently used in the multi-engine
+        # case to track request lifetimes efficiently.
+        self.include_finished_set = include_finished_set
+
+        # Scheduling constraints.
+        self.max_num_running_reqs = self.scheduler_config.max_num_seqs
+        self.max_num_scheduled_tokens = \
+            self.scheduler_config.max_num_batched_tokens
+        self.max_model_len = self.scheduler_config.max_model_len
+
+        # Create the KV cache manager.
+        self.kv_cache_manager = KVCacheManager(
+            kv_cache_config=kv_cache_config,
+            max_model_len=self.max_model_len,
+            enable_caching=cache_config.enable_prefix_caching,
+            caching_hash_algo=self.cache_config.prefix_caching_hash_algo,
+            log_stats=self.log_stats)
+        self.block_size = self.cache_config.block_size
+
+        # req_id -> Request
+        self.requests: dict[str, Request] = {}
+        # Priority queues for requests.
+        self.waiting: deque[Request] = deque()
+        self.running: list[Request] = []
+        # The requests that have been scheduled and are being executed
+        # by the executor.
+        self.scheduled_req_ids: set[str] = set()
+
+        # The request IDs that are finished in between the previous and the
+        # current steps. This is used to notify the workers about the finished
+        # requests so that they can free the cached states for those requests.
+        # This is flushed at the end of each scheduling step.
+        self.finished_req_ids: set[str] = set()
+
+        # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
+        # them at each scheduling step.
+        # Request id -> CachedRequestData
+        self._cached_reqs_data: dict[str, CachedRequestData] = {}
+
+        # Encoder-related.
+        # Calculate encoder cache size if applicable
+        # NOTE: For now we use the same budget for both compute and space.
+        # This can be changed when we make encoder cache for embedding caching
+        # across requests.
+        encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
+            model_config=model_config,
+            scheduler_config=scheduler_config,
+            mm_registry=mm_registry,
+        )
+
+        # NOTE(woosuk): Here, "encoder" includes the vision encoder (and
+        # projector if needed). Currently, we assume that the encoder also
+        # has the Transformer architecture (e.g., ViT).
+        self.max_num_encoder_input_tokens = encoder_compute_budget
+        # NOTE: For the models without encoder (e.g., text-only models),
+        # the encoder cache will not be initialized because cache size is 0
+        # for these models.
+        self.encoder_cache_manager = EncoderCacheManager(
+            cache_size=encoder_cache_size)
+
+    def schedule(self) -> SchedulerOutput:
+        # NOTE(woosuk) on the scheduling algorithm:
+        # There's no "decoding phase" nor "prefill phase" in the scheduler.
+        # Each request just has the num_computed_tokens and
+        # num_tokens_with_spec. num_tokens_with_spec =
+        # len(prompt_token_ids) + len(output_token_ids) + len(spec_token_ids).
+        # At each step, the scheduler tries to assign tokens to the requests
+        # so that each request's num_computed_tokens can catch up its
+        # num_tokens_with_spec. This is general enough to cover
+        # chunked prefills, prefix caching, speculative decoding,
+        # and the "jump decoding" optimization in the future.
+
+        scheduled_new_reqs: list[Request] = []
+        scheduled_resumed_reqs: list[Request] = []
+        scheduled_running_reqs: list[Request] = []
+        preempted_reqs: list[Request] = []
+
+        # NOTE: structured_output_request_ids maps
+        # a request's (request that uses structured output)
+        # request_id to the running request index.
+        # This will helps us determine to slice the grammar bitmask
+        # and only applies valid mask for requests that
+        # uses structured decoding.
+        structured_output_request_ids: dict[str, int] = {}
+
+        req_to_new_block_ids: dict[str, list[int]] = {}
+        num_scheduled_tokens: dict[str, int] = {}
+        token_budget = self.max_num_scheduled_tokens
+        # Encoder-related.
+        scheduled_encoder_inputs: dict[str, list[int]] = {}
+        encoder_budget = self.max_num_encoder_input_tokens
+        # Spec decode-related.
+        scheduled_spec_decode_tokens: dict[str, list[int]] = {}
+
+        # For logging.
+        scheduled_timestamp = time.monotonic()
+
+        # First, schedule the RUNNING requests.
+        req_index = 0
+        while req_index < len(self.running) and token_budget > 0:
+            request = self.running[req_index]
+            if request.request_id in self.scheduled_req_ids:
+                # This request has already been scheduled.
+                req_index += 1
+                continue
+
+            num_new_tokens = (request.num_tokens_with_spec -
+                              request.num_computed_tokens)
+            if (0 < self.scheduler_config.long_prefill_token_threshold <
+                    num_new_tokens):
+                num_new_tokens = (
+                    self.scheduler_config.long_prefill_token_threshold)
+            num_new_tokens = min(num_new_tokens, token_budget)
+            assert num_new_tokens > 0
+
+            # Schedule encoder inputs.
+            if request.has_encoder_inputs:
+                (encoder_inputs_to_schedule, num_new_tokens,
+                 new_encoder_budget) = self._try_schedule_encoder_inputs(
+                     request, request.num_computed_tokens, num_new_tokens,
+                     encoder_budget)
+                if num_new_tokens == 0:
+                    # The request cannot be scheduled because the encoder budget
+                    # or the encoder cache is exhausted.
+                    # NOTE(woosuk): By using `continue` instead of `break` here,
+                    # we intentionally relax the strict FCFS scheduling policy
+                    # to allow lower-priority requests to be scheduled when a
+                    # higher-priority request is blocked by encoder constraints.
+                    req_index += 1
+                    continue
+            else:
+                encoder_inputs_to_schedule = None
+                new_encoder_budget = encoder_budget
+
+            while True:
+                new_blocks = self.kv_cache_manager.allocate_slots(
+                    request, num_new_tokens)
+                if new_blocks is None:
+                    # The request cannot be scheduled.
+                    # Preempt the lowest-priority request.
+                    preempted_req = self.running.pop()
+                    self.kv_cache_manager.free(preempted_req)
+                    preempted_req.status = RequestStatus.PREEMPTED
+                    preempted_req.num_computed_tokens = 0
+                    if self.log_stats:
+                        preempted_req.record_event(
+                            EngineCoreEventType.PREEMPTED, scheduled_timestamp)
+
+                    self.waiting.appendleft(preempted_req)
+                    preempted_reqs.append(preempted_req)
+                    if preempted_req == request:
+                        # No more request to preempt.
+                        can_schedule = False
+                        break
+                else:
+                    # The request can be scheduled.
+                    can_schedule = True
+                    break
+            if not can_schedule:
+                break
+            assert new_blocks is not None
+
+            # Schedule the request.
+            scheduled_running_reqs.append(request)
+            self.scheduled_req_ids.add(request.request_id)
+            if request.use_structured_output:
+                # PERF: in case of chunked prefill,
+                # request might not include any new tokens.
+                # Therefore, we might introduce some additional
+                # cycle to fill in the bitmask, which could be a big no-op.
+                structured_output_request_ids[request.request_id] = req_index
+            req_to_new_block_ids[request.request_id] = [
+                b.block_id for b in new_blocks
+            ]
+            num_scheduled_tokens[request.request_id] = num_new_tokens
+            token_budget -= num_new_tokens
+            req_index += 1
+
+            # Speculative decode related.
+            if request.spec_token_ids:
+                num_scheduled_spec_tokens = (num_new_tokens +
+                                             request.num_computed_tokens -
+                                             request.num_tokens)
+                if num_scheduled_spec_tokens > 0:
+                    # Trim spec_token_ids list to num_scheduled_spec_tokens.
+                    del request.spec_token_ids[num_scheduled_spec_tokens:]
+                    scheduled_spec_decode_tokens[request.request_id] = (
+                        request.spec_token_ids)
+
+            # Encoder-related.
+            if encoder_inputs_to_schedule:
+                scheduled_encoder_inputs[request.request_id] = (
+                    encoder_inputs_to_schedule)
+                # Allocate the encoder cache.
+                for i in encoder_inputs_to_schedule:
+                    self.encoder_cache_manager.allocate(request, i)
+                encoder_budget = new_encoder_budget
+
+        # Record the LoRAs in scheduled_running_reqs
+        scheduled_loras: set[int] = set()
+        if self.lora_config:
+            scheduled_loras = set(
+                req.lora_request.lora_int_id for req in scheduled_running_reqs
+                if req.lora_request and req.lora_request.lora_int_id > 0)
+            assert len(scheduled_loras) <= self.lora_config.max_loras
+
+        # Use a temporary deque to collect requests that need to be skipped
+        # and put back at the head of the waiting queue later
+        skipped_waiting_requests: deque[Request] = deque()
+
+        # Next, schedule the WAITING requests.
+        if not preempted_reqs:
+            while self.waiting and token_budget > 0:
+                if len(self.running) == self.max_num_running_reqs:
+                    break
+
+                request = self.waiting[0]
+
+                # Skip request if the structured output request is still waiting
+                # for FSM compilation.
+                if request.status == RequestStatus.WAITING_FOR_FSM:
+                    structured_output_req = request.structured_output_request
+                    if structured_output_req and structured_output_req.grammar:
+                        request.status = RequestStatus.WAITING
+                    else:
+                        self.waiting.popleft()
+                        skipped_waiting_requests.appendleft(request)
+                        continue
+
+                # Check that adding the request still respects the max_loras
+                # constraint.
+                if self.lora_config and request.lora_request and (
+                        len(scheduled_loras) == self.lora_config.max_loras
+                        and request.lora_request.lora_int_id
+                        not in scheduled_loras):
+                    # Scheduling would exceed max_loras, skip.
+                    self.waiting.popleft()
+                    skipped_waiting_requests.appendleft(request)
+                    continue
+
+                # Get already-cached tokens.
+                computed_blocks, num_computed_tokens = \
+                    self.kv_cache_manager.get_computed_blocks(request)
+                # Number of tokens to be scheduled.
+                # We use `request.num_tokens` instead of
+                # `request.num_prompt_tokens` to consider the resumed requests,
+                # which have output tokens.
+                num_new_tokens = request.num_tokens - num_computed_tokens
+                if (0 < self.scheduler_config.long_prefill_token_threshold <
+                        num_new_tokens):
+                    num_new_tokens = (
+                        self.scheduler_config.long_prefill_token_threshold)
+                num_new_tokens = min(num_new_tokens, token_budget)
+                assert num_new_tokens > 0
+
+                # Schedule encoder inputs.
+                if request.has_encoder_inputs:
+                    (encoder_inputs_to_schedule, num_new_tokens,
+                     new_encoder_budget) = self._try_schedule_encoder_inputs(
+                         request, num_computed_tokens, num_new_tokens,
+                         encoder_budget)
+                    if num_new_tokens == 0:
+                        # The request cannot be scheduled.
+                        break
+                else:
+                    encoder_inputs_to_schedule = None
+                    new_encoder_budget = encoder_budget
+
+                new_blocks = self.kv_cache_manager.allocate_slots(
+                    request, num_new_tokens, computed_blocks)
+                if new_blocks is None:
+                    # The request cannot be scheduled.
+                    break
+
+                self.waiting.popleft()
+                if request.use_structured_output:
+                    structured_output_request_ids[
+                        request.request_id] = req_index
+                req_index += 1
+                self.running.append(request)
+                self.scheduled_req_ids.add(request.request_id)
+                if self.log_stats:
+                    request.record_event(EngineCoreEventType.SCHEDULED,
+                                         scheduled_timestamp)
+                if request.status == RequestStatus.WAITING:
+                    scheduled_new_reqs.append(request)
+                elif request.status == RequestStatus.PREEMPTED:
+                    scheduled_resumed_reqs.append(request)
+                else:
+                    raise RuntimeError(
+                        f"Invalid request status: {request.status}")
+
+                if self.lora_config and request.lora_request:
+                    scheduled_loras.add(request.lora_request.lora_int_id)
+                req_to_new_block_ids[request.request_id] = [
+                    b.block_id for b in computed_blocks + new_blocks
+                ]
+                num_scheduled_tokens[request.request_id] = num_new_tokens
+                token_budget -= num_new_tokens
+                request.status = RequestStatus.RUNNING
+                request.num_computed_tokens = num_computed_tokens
+
+                # Encoder-related.
+                if encoder_inputs_to_schedule:
+                    scheduled_encoder_inputs[request.request_id] = (
+                        encoder_inputs_to_schedule)
+                    # Allocate the encoder cache.
+                    for i in encoder_inputs_to_schedule:
+                        self.encoder_cache_manager.allocate(request, i)
+                    encoder_budget = new_encoder_budget
+
+        # Put back any skipped requests at the head of the waiting queue
+        if skipped_waiting_requests:
+            self.waiting.extendleft(skipped_waiting_requests)
+
+        # Check if the scheduling constraints are satisfied.
+        total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
+        assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
+        assert token_budget >= 0
+        assert len(self.running) <= self.max_num_running_reqs
+        # Since some requests in the RUNNING queue may not be scheduled in
+        # this step, the total number of scheduled requests can be smaller than
+        # len(self.running).
+        assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) +
+                len(scheduled_running_reqs) <= len(self.running))
+
+        # Get the longest common prefix among all requests in the running queue.
+        # This can be potentially used for cascade attention.
+        num_common_prefix_blocks = 0
+        if self.running:
+            any_request = self.running[0]
+            num_common_prefix_blocks = (
+                self.kv_cache_manager.get_num_common_prefix_blocks(
+                    any_request, len(self.running)))
+
+        grammar_bitmask = self.structured_output_manager.grammar_bitmask(
+            self.requests,
+            structured_output_request_ids,
+            len(self.running),
+        )
+        # Construct the scheduler output.
+        new_reqs_data = [
+            NewRequestData.from_request(req,
+                                        req_to_new_block_ids[req.request_id])
+            for req in scheduled_new_reqs
+        ]
+        resumed_reqs_data = [
+            self._make_cached_request_data(
+                req,
+                num_scheduled_tokens[req.request_id],
+                len(scheduled_spec_decode_tokens.get(req.request_id, ())),
+                req_to_new_block_ids[req.request_id],
+                resumed_from_preemption=True,
+            ) for req in scheduled_resumed_reqs
+        ]
+        running_reqs_data = [
+            self._make_cached_request_data(
+                req,
+                num_scheduled_tokens[req.request_id],
+                len(scheduled_spec_decode_tokens.get(req.request_id, ())),
+                req_to_new_block_ids[req.request_id],
+                resumed_from_preemption=False,
+            ) for req in scheduled_running_reqs
+        ]
+        scheduler_output = SchedulerOutput(
+            scheduled_new_reqs=new_reqs_data,
+            scheduled_cached_reqs=resumed_reqs_data + running_reqs_data,
+            num_scheduled_tokens=num_scheduled_tokens,
+            total_num_scheduled_tokens=total_num_scheduled_tokens,
+            scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
+            scheduled_encoder_inputs=scheduled_encoder_inputs,
+            num_common_prefix_blocks=num_common_prefix_blocks,
+            # finished_req_ids is an existing state in the scheduler,
+            # instead of being newly scheduled in this step.
+            # It contains the request IDs that are finished in between
+            # the previous and the current steps.
+            finished_req_ids=self.finished_req_ids,
+            free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
+            structured_output_request_ids=structured_output_request_ids,
+            grammar_bitmask=grammar_bitmask,
+        )
+
+        # Advance the number of computed tokens for the request AFTER
+        # the request is scheduled.
+        # 1. The scheduler_output of the current step has to include the
+        #    original number of scheduled tokens to determine input IDs.
+        # 2. Advance the number of computed tokens here allowing us to
+        #    schedule the prefill request again immediately in the next
+        #    scheduling step.
+        # 3. If some tokens (e.g. spec tokens) are rejected later, the number of
+        #    computed tokens will be adjusted in update_from_output.
+        for req_id, num_scheduled_token in num_scheduled_tokens.items():
+            self.requests[req_id].num_computed_tokens += num_scheduled_token
+
+        self.finished_req_ids = set()
+        return scheduler_output
+
+    def _make_cached_request_data(
+        self,
+        request: Request,
+        num_scheduled_tokens: int,
+        num_scheduled_spec_tokens: int,
+        new_block_ids: list[int],
+        resumed_from_preemption: bool,
+    ) -> CachedRequestData:
+        # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
+        # them at each scheduling step.
+        num_computed_tokens = request.num_computed_tokens
+        num_regular_tokens = num_scheduled_tokens - num_scheduled_spec_tokens
+        new_token_ids = request.all_token_ids[
+            num_computed_tokens:num_computed_tokens + num_regular_tokens]
+        req_data = self._cached_reqs_data.get(request.request_id)
+        if req_data is not None:
+            req_data.resumed_from_preemption = resumed_from_preemption
+            req_data.new_token_ids = new_token_ids
+            req_data.new_block_ids = new_block_ids
+            req_data.num_computed_tokens = num_computed_tokens
+        else:
+            req_data = CachedRequestData.from_request(request,
+                                                      resumed_from_preemption,
+                                                      new_token_ids,
+                                                      new_block_ids)
+            self._cached_reqs_data[request.request_id] = req_data
+        return req_data
+
+    def _try_schedule_encoder_inputs(
+        self,
+        request: Request,
+        num_computed_tokens: int,
+        num_new_tokens: int,
+        encoder_budget: int,
+    ) -> tuple[list[int], int, int]:
+        """
+        Determine which encoder inputs need to be scheduled in the current step,
+        and update `num_new_tokens` and encoder token budget accordingly.
+
+        An encoder input will be scheduled if:
+        - Its output tokens overlap with the range of tokens being computed
+        in this step, i.e.,
+        [num_computed_tokens, num_computed_tokens + num_new_tokens).
+        - It is not already computed and stored in the encoder cache.
+        - There is sufficient encoder token budget to process it.
+        - The encoder cache has space to store it.
+
+        If an encoder input cannot be scheduled due to cache or budget
+        limitations, the method adjusts `num_new_tokens` to schedule only the
+        decoder tokens up to just before the unschedulable encoder input.
+        """
+        encoder_inputs_to_schedule: list[int] = []
+        mm_positions = request.mm_positions
+        assert mm_positions is not None
+        assert len(mm_positions) > 0
+        for i, pos_info in enumerate(mm_positions):
+            start_pos = pos_info["offset"]
+            num_encoder_tokens = pos_info["length"]
+
+            # The encoder output is needed if the two ranges overlap:
+            # [num_computed_tokens, num_computed_tokens + num_new_tokens) and
+            # [start_pos, start_pos + num_encoder_tokens)
+            if start_pos >= num_computed_tokens + num_new_tokens:
+                # The encoder input is not needed in this step.
+                break
+            if start_pos + num_encoder_tokens <= num_computed_tokens:
+                # The encoder input is already computed and stored
+                # in the decoder's KV cache.
+                continue
+
+            if self.encoder_cache_manager.has_cache(request, i):
+                # The encoder input is already computed and cached.
+                continue
+            if (not self.encoder_cache_manager.can_allocate(request, i)
+                    or num_encoder_tokens > encoder_budget):
+                # The encoder cache is full or the encoder budget is exhausted.
+                # NOTE(woosuk): We assume that the encoder input tokens should
+                # be processed altogether, as the encoder usually uses
+                # bidirectional attention.
+                if num_computed_tokens < start_pos:
+                    # We only schedule the decoder tokens just before the
+                    # encoder input.
+                    num_new_tokens = start_pos - num_computed_tokens
+                else:
+                    # Because of prefix caching, num_computed_tokens is greater
+                    # than start_pos even though its encoder input is not
+                    # available. In this case, we can't schedule any token for
+                    # the request in this step.
+                    num_new_tokens = 0
+                break
+
+            encoder_budget -= num_encoder_tokens
+            encoder_inputs_to_schedule.append(i)
+        return encoder_inputs_to_schedule, num_new_tokens, encoder_budget
+
+    def update_from_output(
+        self,
+        scheduler_output: SchedulerOutput,
+        model_runner_output: ModelRunnerOutput,
+    ) -> EngineCoreOutputs:
+        sampled_token_ids = model_runner_output.sampled_token_ids
+        spec_token_ids = model_runner_output.spec_token_ids
+        logprobs = model_runner_output.logprobs
+        prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict
+        num_scheduled_tokens = scheduler_output.num_scheduled_tokens
+
+        new_running: list[Request] = []
+        outputs: list[EngineCoreOutput] = []
+        spec_decoding_stats: Optional[SpecDecodingStats] = None
+
+        # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
+        # loop can be a performance bottleneck. We should do our best to avoid
+        # expensive operations inside the loop.
+        for request in self.running:
+            req_id = request.request_id
+            num_tokens_scheduled = num_scheduled_tokens.get(req_id, 0)
+            if num_tokens_scheduled == 0:
+                # The request was not scheduled in this step.
+                new_running.append(request)
+                continue
+
+            req_index = model_runner_output.req_id_to_index[req_id]
+            generated_token_ids = sampled_token_ids[req_index]
+
+            scheduled_spec_token_ids = (
+                scheduler_output.scheduled_spec_decode_tokens.get(req_id))
+            if scheduled_spec_token_ids:
+                # num_computed_tokens represents the number of tokens
+                # processed in the current step, considering scheduled
+                # tokens and rejections. If some tokens are rejected,
+                # num_computed_tokens is decreased by the number of rejected
+                # tokens, where is given by:
+                # len(scheduled_spec_token_ids) + 1 - len(generated_token_ids).
+                num_tokens_rejected = (len(scheduled_spec_token_ids) + 1 -
+                                       len(generated_token_ids))
+                request.num_computed_tokens -= num_tokens_rejected
+                spec_decoding_stats = self.make_spec_decoding_stats(
+                    spec_decoding_stats,
+                    num_draft_tokens=len(scheduled_spec_token_ids),
+                    num_accepted_tokens=len(generated_token_ids) - 1)
+
+            cached_encoder_input_ids = (
+                self.encoder_cache_manager.get_cached_input_ids(request))
+            # OPTIMIZATION: Avoid list(set) if the set is empty.
+            if cached_encoder_input_ids:
+                for input_id in list(cached_encoder_input_ids):
+                    mm_positions = request.mm_positions[input_id]
+                    start_pos = mm_positions["offset"]
+                    num_tokens = mm_positions["length"]
+                    if start_pos + num_tokens <= request.num_computed_tokens:
+                        # The encoder output is already processed and stored
+                        # in the decoder's KV cache.
+                        self.encoder_cache_manager.free_encoder_input(
+                            request, input_id)
+
+            # Add newly generated spec token ids to the request.
+            if spec_token_ids is not None:
+                request.spec_token_ids = spec_token_ids[req_index]
+
+            stopped = False
+            new_logprobs = None
+            new_token_ids = generated_token_ids
+
+            # Append generated tokens and check for stop. Note that if
+            # a request is still being prefilled, we expect the model runner
+            # to return empty token ids for the request.
+            for num_new, output_token_id in enumerate(new_token_ids, 1):
+                request.append_output_token_ids(output_token_id)
+
+                # Check for stop and update request state.
+                # This must be called before we make the EngineCoreOutput.
+                stopped = check_stop(request, self.max_model_len)
+                if stopped:
+                    self._free_request(request)
+                    del new_token_ids[num_new:]  # Trim new tokens if needed.
+                    break
+
+            # Extract sample logprobs if needed.
+            if request.sampling_params.logprobs is not None and logprobs:
+                # NOTE: once we support N tokens per step (spec decode),
+                # the outer lists can be of length > 1.
+                new_logprobs = logprobs.slice(req_index, req_index + 1)
+
+            if new_token_ids and request.use_structured_output:
+                # NOTE: structured_output_request
+                # should not be None if use_structured_output, we have
+                # check above, so safe to ignore type warning
+                request.structured_output_request.grammar.accept_tokens(  # type: ignore[union-attr]
+                    req_id, new_token_ids)
+
+            # Get prompt logprobs for this request.
+            prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
+            if new_token_ids:
+                # Add EngineCoreOutput for this Request.
+                outputs.append(
+                    EngineCoreOutput(
+                        request_id=req_id,
+                        new_token_ids=new_token_ids,
+                        finish_reason=request.get_finished_reason(),
+                        new_logprobs=new_logprobs,
+                        new_prompt_logprobs_tensors=prompt_logprobs_tensors,
+                        stop_reason=request.stop_reason,
+                        events=request.take_events()))
+            else:
+                # Invariant: EngineCore returns no partial prefill outputs.
+                assert not prompt_logprobs_tensors
+
+            self.scheduled_req_ids.remove(req_id)
+            if not stopped:
+                new_running.append(request)
+
+        self.running = new_running
+        engine_core_outputs = EngineCoreOutputs(
+            outputs=outputs,
+            scheduler_stats=self.make_stats(spec_decoding_stats),
+        )
+        if self.include_finished_set:
+            #TODO currently sending duplicates here, improve this
+            engine_core_outputs.finished_requests = (
+                scheduler_output.finished_req_ids | self.finished_req_ids)
+
+        return engine_core_outputs
+
+    def add_request(self, request: Request) -> None:
+        self.waiting.append(request)
+        self.requests[request.request_id] = request
+        if self.log_stats:
+            request.record_event(EngineCoreEventType.QUEUED)
+
+    def finish_requests(
+        self,
+        request_ids: Union[str, Iterable[str]],
+        finished_status: RequestStatus,
+    ) -> None:
+        """Handles the finish signal from outside the scheduler.
+
+        For example, the API server can abort a request when the client
+        disconnects.
+        """
+        assert RequestStatus.is_finished(finished_status)
+        if isinstance(request_ids, str):
+            request_ids = (request_ids, )
+        else:
+            request_ids = set(request_ids)
+
+        for req_id in request_ids:
+            request = self.requests.get(req_id)
+            if request is None:
+                # Invalid request ID.
+                continue
+
+            if request.status == RequestStatus.RUNNING:
+                self.running.remove(request)
+                self.scheduled_req_ids.discard(request.request_id)
+            else:
+                self.waiting.remove(request)
+            request.status = finished_status
+            self._free_request(request)
+
+    def _free_request(self, request: Request) -> None:
+        assert request.is_finished()
+        self.kv_cache_manager.free(request)
+        self.kv_cache_manager.free_block_hashes(request)
+        self.encoder_cache_manager.free(request)
+        self._cached_reqs_data.pop(request.request_id, None)
+        del self.requests[request.request_id]
+        self.finished_req_ids.add(request.request_id)
+
+    def get_num_unfinished_requests(self) -> int:
+        return len(self.waiting) + len(self.running)
+
+    def has_finished_requests(self) -> bool:
+        return len(self.finished_req_ids) > 0
+
+    def get_num_unscheduled_requests(self) -> int:
+        """Number of requests that are not being processed by the executor."""
+        return self.get_num_unfinished_requests() - len(self.scheduled_req_ids)
+
+    def reset_prefix_cache(self) -> bool:
+        return self.kv_cache_manager.reset_prefix_cache()
+
+    def make_stats(
+        self,
+        spec_decoding_stats: Optional[SpecDecodingStats] = None,
+    ) -> Optional[SchedulerStats]:
+        if not self.log_stats:
+            return None
+        return SchedulerStats(
+            num_running_reqs=len(self.running),
+            num_waiting_reqs=len(self.waiting),
+            gpu_cache_usage=self.kv_cache_manager.usage,
+            prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
+            spec_decoding_stats=spec_decoding_stats,
+        )
+
+    def make_spec_decoding_stats(
+        self,
+        spec_decoding_stats: Optional[SpecDecodingStats],
+        num_draft_tokens: int,
+        num_accepted_tokens: int,
+    ) -> Optional[SpecDecodingStats]:
+        if not self.log_stats:
+            return None
+        if spec_decoding_stats is None:
+            spec_decoding_stats = SpecDecodingStats()
+        spec_decoding_stats.observe(num_draft_tokens=num_draft_tokens,
+                                    num_accepted_tokens=num_accepted_tokens)
+        return spec_decoding_stats
--- a/vllm/v1/core/sched/utils.py
+++ b/vllm/v1/core/sched/utils.py
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: Apache-2.0
+from vllm.v1.request import Request, RequestStatus
+
+
+def check_stop(request: Request, max_model_len: int) -> bool:
+    if (request.num_tokens >= max_model_len
+            or request.num_output_tokens >= request.max_tokens):
+        request.status = RequestStatus.FINISHED_LENGTH_CAPPED
+        return True
+
+    sampling_params = request.sampling_params
+    last_token_id = request.output_token_ids[-1]
+    if (not sampling_params.ignore_eos
+            and last_token_id == request.eos_token_id):
+        request.status = RequestStatus.FINISHED_STOPPED
+        return True
+
+    if last_token_id in (sampling_params.stop_token_ids or ()):
+        request.status = RequestStatus.FINISHED_STOPPED
+        request.stop_reason = last_token_id
+        return True
+    return False
--- a/vllm/v1/core/specialized_manager.py
+++ b/vllm/v1/core/specialized_manager.py
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+
+from vllm.utils import cdiv
+from vllm.v1.core.block_pool import BlockPool
+from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec,
+                                        SlidingWindowSpec)
+
+
+class SpecializedManager(ABC):
+    """
+    An abstract base class for specialized managers that handle the kv
+    cache management logic of different attention layers.
+    """
+
+    def __init__(
+        self,
+        kv_cache_spec: KVCacheSpec,
+        block_pool: BlockPool,
+    ) -> None:
+        """
+        Initializes the SpecializedManager.
+        Args:
+            kv_cache_spec: The kv_cache_spec for this manager.
+            block_pool: The block pool.
+        """
+
+        self.block_size = kv_cache_spec.block_size
+        self.kv_cache_spec = kv_cache_spec
+        self.block_pool = block_pool
+
+    @abstractmethod
+    def find_longest_cache_hit(
+            self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
+        """
+        Get the longest cache hit prefix of the blocks. If no cache hit is 
+        found, return an empty list.
+
+        Args:
+            block_hashes: The block hashes of the request.
+        Returns:
+            A list of cached blocks with skipped blocks replaced by null block.
+            For example, sliding window manager should return a list like
+            [NULL, NULL, KVCacheBlock(7), KVCacheBlock(8)] for block size 4 and 
+            sliding window 8. 
+        """
+
+        raise NotImplementedError
+
+    @abstractmethod
+    def remove_skipped_blocks(self, blocks: list[KVCacheBlock],
+                              num_computed_tokens: int) -> list[KVCacheBlock]:
+        """
+        Remove the blocks that are no longer needed from `blocks`. The removed 
+        blocks should be replaced by null_block. Return the removed blocks in 
+        eviction order, where the first returned block should be evicted first.
+        Don't free the removed blocks in this function.
+
+        Args:
+            blocks: The list of blocks to be updated.
+            num_computed_tokens: The number of tokens that have been computed.
+        Returns:
+            The removed blocks in eviction order.
+        """
+        raise NotImplementedError
+
+
+class FullAttentionManager(SpecializedManager):
+
+    def find_longest_cache_hit(
+            self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
+        computed_blocks: list[KVCacheBlock] = []
+        for block_hash in block_hashes:
+            # block_hashes is a chain of block hashes. If a block hash is not
+            # in the cached_block_hash_to_id, the following block hashes are
+            # not computed yet for sure.
+            if cached_block := self.block_pool.get_cached_block(block_hash):
+                computed_blocks.append(cached_block)
+            else:
+                break
+        return computed_blocks
+
+    def remove_skipped_blocks(self, blocks: list[KVCacheBlock],
+                              num_computed_tokens: int) -> list[KVCacheBlock]:
+        # No need to remove blocks for full attention.
+        return []
+
+
+class SlidingWindowManager(SpecializedManager):
+
+    def __init__(self, kv_cache_spec: SlidingWindowSpec,
+                 block_pool: BlockPool):
+        super().__init__(kv_cache_spec, block_pool)
+        self.sliding_window = kv_cache_spec.sliding_window
+        # The number of contiguous blocks needed for prefix cache hit.
+        # -1 since the input token itself is also included in the window
+        self.sliding_window_contiguous_blocks = cdiv(
+            (kv_cache_spec.sliding_window - 1), self.block_size)
+        self._null_block = block_pool.null_block
+
+    def find_longest_cache_hit(
+            self, block_hashes: list[BlockHashType]) -> list[KVCacheBlock]:
+        # TODO: reduce i by sliding_window_contiguous_blocks when cache miss, to
+        # optimize the time complexity from O(len(block_hashes)) to
+        # O(len(block_hashes) / sliding_window_contiguous_blocks +
+        # sliding_window_contiguous_blocks),
+        # which is good for low cache hit rate scenarios.
+        computed_blocks = [self._null_block] * len(block_hashes)
+        num_contiguous_blocks = 0
+
+        # Search from right to left and early stop when a match is found.
+        for i in range(len(block_hashes) - 1, -1, -1):
+            if cached_block := self.block_pool.get_cached_block(
+                    block_hashes[i]):
+                computed_blocks[i] = cached_block
+                num_contiguous_blocks += 1
+                if (num_contiguous_blocks
+                        >= self.sliding_window_contiguous_blocks):
+                    # Trim the trailing blocks.
+                    # E.g., [NULL, NULL, 8, 3, NULL, 9] -> [NULL, NULL, 8, 3]
+                    # when sliding_window_contiguous_blocks=2.
+                    del computed_blocks[i + num_contiguous_blocks:]
+                    return computed_blocks
+            else:
+                num_contiguous_blocks = 0
+        # The first `num_contiguous_blocks` is a cache hit even if
+        # `num_contiguous_blocks < sliding_window_contiguous_blocks`.
+        del computed_blocks[num_contiguous_blocks:]
+        return computed_blocks
+
+    def remove_skipped_blocks(self, blocks: list[KVCacheBlock],
+                              num_computed_tokens: int) -> list[KVCacheBlock]:
+        # Remove the blocks that are no longer be in the sliding window and
+        # skipped during the attention computation.
+        last_useful_token = num_computed_tokens - self.sliding_window + 1
+        last_useful_block = last_useful_token // self.block_size
+
+        removed_blocks: list[KVCacheBlock] = []
+        for i in range(last_useful_block - 1, -1, -1):
+            if blocks[i] == self._null_block:
+                # If the block is already a null block, the blocks before it
+                # should also have been set to null blocks by the previous calls
+                # to this function.
+                break
+            removed_blocks.append(blocks[i])
+            blocks[i] = self._null_block
+        return removed_blocks
+
+
+spec_manager_map: dict[type[KVCacheSpec], type[SpecializedManager]] = {
+    FullAttentionSpec: FullAttentionManager,
+    SlidingWindowSpec: SlidingWindowManager,
+}
+
+
+def get_specialized_manager(kv_cache_spec: KVCacheSpec,
+                            block_pool: BlockPool) -> SpecializedManager:
+    manager_class = spec_manager_map[type(kv_cache_spec)]
+    manager = manager_class(kv_cache_spec, block_pool)
+    return manager