first commit

2026-03-10 13:31:25 +08:00
parent ba974cecfa
commit b62b889355
2604 changed files with 438977 additions and 0 deletions
--- a/vllm_br/v1/core/init.py
+++ b/vllm_br/v1/core/init.py
@@ -0,0 +1,17 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from . import kv_cache_utils, sched  # noqa: F401
--- a/vllm_br/v1/core/pycache/init.cpython-310.pyc
+++ b/vllm_br/v1/core/pycache/init.cpython-310.pyc
--- a/vllm_br/v1/core/pycache/kv_cache_utils.cpython-310.pyc
+++ b/vllm_br/v1/core/pycache/kv_cache_utils.cpython-310.pyc
--- a/vllm_br/v1/core/kv_cache_utils.py
+++ b/vllm_br/v1/core/kv_cache_utils.py
@@ -0,0 +1,219 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from fastcore.basics import patch_to
+
+import vllm.v1.core.kv_cache_utils
+from vllm.config import VllmConfig
+from vllm.logger import logger
+from vllm.v1.core.kv_cache_utils import (
+    create_kv_cache_group_specs, get_max_concurrency_for_kv_cache_config,
+    get_uniform_page_size, may_override_num_blocks)
+from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheGroupSpec,
+                                        KVCacheSpec, KVCacheTensor,
+                                        UniformTypeKVCacheSpecs)
+from vllm_br.v1.attention.backends.attention_v1 import (
+    SUPAFlashAttentionBackend)
+
+
+@patch_to(vllm.v1.core.kv_cache_utils)
+def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
+                                      kv_cache_spec: dict[str, KVCacheSpec],
+                                      available_memory: int) -> KVCacheConfig:
+    """
+    Generates the KV cache configuration for a model with one type of KV cache.
+    Divide the available memory equally among all layers.
+
+    The block count is rounded down to a multiple of the alignment reported
+    by ``SUPAFlashAttentionBackend.get_kv_cache_usharp_alignment`` and capped
+    at 1024 times that alignment. If ``num_gpu_blocks_override`` is set it
+    replaces the computed value as-is (no alignment or cap is re-applied).
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of each attention layer in the model
+        available_memory: Memory available for KV cache in bytes.
+
+    Returns:
+        The generated KVCacheConfig, holding one KVCacheTensor per layer and
+        a single KV cache group that contains every layer.
+
+    Raises:
+        AssertionError: If the layers do not all share one page size.
+    """
+    page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
+    assert len(page_sizes) == 1
+    page_size = page_sizes.pop()
+
+    # NOTE: the SUPA KV cache layout needs the block count aligned to this
+    # granularity; MLA and FlashAttention use the same granularity.
+    th_gran = SUPAFlashAttentionBackend.get_kv_cache_usharp_alignment(
+        vllm_config.cache_config.block_size)
+    num_blocks = int(available_memory // page_size // len(kv_cache_spec))
+
+    # NOTE: limit gpu blocks number due to the shape restriction of colmajor
+    # layout: round down to a multiple of th_gran, at most th_gran * 1024.
+    num_blocks = min(th_gran * 1024, num_blocks // th_gran * th_gran)
+
+    # Guard against a negative result when available_memory is negative.
+    num_blocks = max(num_blocks, 0)
+
+    if vllm_config.cache_config.num_gpu_blocks_override is not None:
+        num_gpu_blocks_override = \
+            vllm_config.cache_config.num_gpu_blocks_override
+        logger.info(
+            "Overriding num_gpu_blocks=%d with "
+            "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
+        num_blocks = num_gpu_blocks_override
+
+    num_tokens = num_blocks * vllm_config.cache_config.block_size
+    num_tokens_str = f"{num_tokens:,}"
+    logger.info("GPU KV cache size: %s tokens", num_tokens_str)
+    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
+    max_concurrency = num_tokens / vllm_config.model_config.max_model_len
+    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                max_model_len_str, max_concurrency)
+
+    per_layer_size = page_size * num_blocks
+    # All layers have the same KV cache spec, so we create one kv cache group
+    # for all layers.
+    grouped_layer_names = [list(kv_cache_spec.keys())]
+
+    # Each layer uses a separate tensor to store its KV cache. This is the
+    # same `kv_cache_tensors` / `shared_by` form that
+    # get_kv_cache_config_from_groups in this file passes to KVCacheConfig.
+    kv_cache_tensors = [
+        KVCacheTensor(size=per_layer_size, shared_by=[layer_name])
+        for layer_name in kv_cache_spec
+    ]
+
+    kv_cache_config = KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=kv_cache_tensors,
+        kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
+                                                    grouped_layer_names),
+    )
+    return kv_cache_config
+
+
+logger.info('===[Patch] patch _get_kv_cache_config_uniform_type')
+
+
+# @patch_to(vllm.v1.core.kv_cache_utils)
+def get_num_blocks(vllm_config: VllmConfig, num_layers: int,
+                   available_memory: int, page_size: int) -> int:
+    """
+    Compute how many KV cache blocks fit into the available memory.
+
+    The memory is split evenly across ``num_layers``; the resulting count is
+    rounded down to a multiple of the SUPA alignment granularity, capped at
+    1024 times that granularity, clamped to be non-negative and finally
+    passed through ``may_override_num_blocks``.
+
+    Args:
+        vllm_config: The global VllmConfig
+        num_layers: The number of layers sharing the memory
+        available_memory: Memory available for KV cache in bytes.
+        page_size: The page size of the KV cache.
+
+    Returns:
+        The number of KV cache blocks to allocate.
+    """
+    alignment = SUPAFlashAttentionBackend.get_kv_cache_usharp_alignment(
+        vllm_config.cache_config.block_size)
+    per_layer_blocks = int(available_memory // page_size // num_layers)
+    # Upper bound first, then the aligned (rounded-down) block count.
+    capped_blocks = min(alignment * 1024,
+                        per_layer_blocks // alignment * alignment)
+    return may_override_num_blocks(vllm_config, max(capped_blocks, 0))
+
+
+@patch_to(vllm.v1.core.kv_cache_utils)
+def get_kv_cache_config_from_groups(vllm_config: VllmConfig,
+                                    kv_cache_groups: list[KVCacheGroupSpec],
+                                    kv_cache_specs: dict[str, KVCacheSpec],
+                                    available_memory: int) -> KVCacheConfig:
+    """
+    Build the KVCacheConfig from the KV cache groups and per-layer specs.
+
+    Block counts are aligned to the SUPA granularity and capped at 1024
+    times that granularity before any user override is applied.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_groups: The KV cache groups
+        kv_cache_specs: The KV cache spec of each attention layer in the model
+        available_memory: Memory available for KV cache in bytes
+    Returns:
+        The generated KVCacheConfig
+    """
+    if not kv_cache_groups:
+        # Attention free models have no KV cache; BlockPool still needs a
+        # null_block, hence num_blocks=1.
+        return KVCacheConfig(
+            num_blocks=1,
+            kv_cache_tensors=[],
+            kv_cache_groups=kv_cache_groups,
+        )
+
+    # Decide how model runners should initialize the KV cache tensors.
+    first_group = kv_cache_groups[0]
+    single_uniform_group = len(kv_cache_groups) == 1 and isinstance(
+        first_group.kv_cache_spec, UniformTypeKVCacheSpecs)
+
+    if single_uniform_group:
+        # Special case: every layer has the same KV cache type but the hidden
+        # sizes differ, so each layer gets a tensor sized from its own spec.
+        uniform_spec = first_group.kv_cache_spec
+        alignment = SUPAFlashAttentionBackend.get_kv_cache_usharp_alignment(
+            vllm_config.cache_config.block_size)
+        num_blocks = available_memory // uniform_spec.page_size_bytes
+        num_blocks = min(alignment * 1024,
+                         num_blocks // alignment * alignment)
+        num_blocks = may_override_num_blocks(vllm_config, max(num_blocks, 0))
+
+        layer_specs = uniform_spec.kv_cache_specs
+        kv_cache_tensors = [
+            KVCacheTensor(size=layer_specs[name].page_size_bytes * num_blocks,
+                          shared_by=[name])
+            for name in first_group.layer_names
+        ]
+    else:
+        # General case: there are group_size memory pools, each shared by one
+        # layer from every group. Layers of different groups have different
+        # block tables, so they use different parts of the shared tensor.
+        # Example with 3 groups (full.0, full.1), (sw.0, sw.2),
+        # (sw.1, padding) and group_size = 2:
+        #   full.0, sw.0, sw.1 share one tensor of size available_memory//2
+        #   full.1, sw.2 share another tensor of size available_memory//2
+        group_size = max(len(group.layer_names) for group in kv_cache_groups)
+
+        page_size = get_uniform_page_size(kv_cache_specs)
+        assert group_size > 0, "group_size must be greater than 0"
+        num_blocks = get_num_blocks(vllm_config, group_size, available_memory,
+                                    page_size)
+        pool_size = page_size * num_blocks
+        # Pool `slot` is shared by the slot-th layer of every group that has
+        # at least slot + 1 layers.
+        kv_cache_tensors = [
+            KVCacheTensor(size=pool_size,
+                          shared_by=[
+                              group.layer_names[slot]
+                              for group in kv_cache_groups
+                              if slot < len(group.layer_names)
+                          ]) for slot in range(group_size)
+        ]
+
+    kv_cache_config = KVCacheConfig(
+        num_blocks=num_blocks,
+        kv_cache_tensors=kv_cache_tensors,
+        kv_cache_groups=kv_cache_groups,
+    )
+
+    # Report the KV cache size and the maximum concurrency.
+    min_block_size = min(group.kv_cache_spec.block_size
+                         for group in kv_cache_groups)
+    num_tokens = num_blocks // len(kv_cache_groups) * min_block_size
+    dcp_size = vllm_config.parallel_config.decode_context_parallel_size
+    if dcp_size > 1:
+        num_tokens *= dcp_size
+        logger.info(
+            "Multiplying the GPU KV cache size by the dcp_world_size %d.",
+            dcp_size)
+    logger.info("GPU KV cache size: %s tokens", f"{num_tokens:,}")
+    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
+    max_concurrency = get_max_concurrency_for_kv_cache_config(
+        vllm_config, kv_cache_config)
+    logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                max_model_len_str, max_concurrency)
+    return kv_cache_config
+
+
+logger.info('===[Patch] patch get_kv_cache_config_from_groups')
--- a/vllm_br/v1/core/sched/init.py
+++ b/vllm_br/v1/core/sched/init.py
@@ -0,0 +1,17 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from . import scheduler  # noqa: F401
--- a/vllm_br/v1/core/sched/pycache/init.cpython-310.pyc
+++ b/vllm_br/v1/core/sched/pycache/init.cpython-310.pyc
--- a/vllm_br/v1/core/sched/pycache/scheduler.cpython-310.pyc
+++ b/vllm_br/v1/core/sched/pycache/scheduler.cpython-310.pyc
--- a/vllm_br/v1/core/sched/scheduler.py
+++ b/vllm_br/v1/core/sched/scheduler.py
@@ -0,0 +1,558 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+from __future__ import annotations
+
+import itertools
+import time
+from typing import Optional
+
+from fastcore.basics import patch_to
+
+from vllm.distributed.kv_events import KVEventBatch
+from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
+                                       SchedulerOutput)
+from vllm.v1.core.sched.request_queue import (SchedulingPolicy,
+                                              create_request_queue)
+from vllm.v1.core.sched.scheduler import Scheduler
+from vllm.v1.engine import EngineCoreEventType
+from vllm.v1.request import Request, RequestStatus
+
+logger = init_logger(__name__)
+
+
+@patch_to(Scheduler)
+def schedule(self) -> SchedulerOutput:
+    # NOTE(woosuk) on the scheduling algorithm:
+    # There's no "decoding phase" nor "prefill phase" in the scheduler.
+    # Each request just has the num_computed_tokens and
+    # num_tokens_with_spec. num_tokens_with_spec =
+    # len(prompt_token_ids) + len(output_token_ids) + len(spec_token_ids).
+    # At each step, the scheduler tries to assign tokens to the requests
+    # so that each request's num_computed_tokens can catch up its
+    # num_tokens_with_spec. This is general enough to cover
+    # chunked prefills, prefix caching, speculative decoding,
+    # and the "jump decoding" optimization in the future.
+
+    scheduled_new_reqs: list[Request] = []
+    scheduled_resumed_reqs: list[Request] = []
+    scheduled_running_reqs: list[Request] = []
+    preempted_reqs: list[Request] = []
+
+    req_to_new_blocks: dict[str, KVCacheBlocks] = {}
+    num_scheduled_tokens: dict[str, int] = {}
+    token_budget = self.max_num_scheduled_tokens
+    # Encoder-related.
+    scheduled_encoder_inputs: dict[str, list[int]] = {}
+    encoder_compute_budget = self.max_num_encoder_input_tokens
+    # Spec decode-related.
+    scheduled_spec_decode_tokens: dict[str, list[int]] = {}
+
+    # For logging.
+    scheduled_timestamp = time.monotonic()
+
+    # First, schedule the RUNNING requests.
+    req_index = 0
+    while req_index < len(self.running) and token_budget > 0:
+        request = self.running[req_index]
+
+        num_new_tokens = (request.num_tokens_with_spec +
+                          request.num_output_placeholders -
+                          request.num_computed_tokens)
+        if (0 < self.scheduler_config.long_prefill_token_threshold <
+                num_new_tokens):
+            num_new_tokens = (
+                self.scheduler_config.long_prefill_token_threshold)
+        num_new_tokens = min(num_new_tokens, token_budget)
+
+        # Make sure the input position does not exceed the max model len.
+        # This is necessary when using spec decoding.
+        num_new_tokens = min(
+            num_new_tokens,
+            self.max_model_len - 1 - request.num_computed_tokens)
+
+        # Schedule encoder inputs.
+        encoder_inputs_to_schedule = None
+        new_encoder_compute_budget = encoder_compute_budget
+        if request.has_encoder_inputs:
+            (encoder_inputs_to_schedule, num_new_tokens,
+             new_encoder_compute_budget) = self._try_schedule_encoder_inputs(
+                 request, request.num_computed_tokens, num_new_tokens,
+                 encoder_compute_budget)
+
+        if self.scheduler_config.chunked_prefill_enabled and request.num_output_tokens == 0:
+            # shortest chunked prefill length is num_spec_tokens + 1
+            prefill_schedul_threshold = self.num_spec_tokens + 1
+            # Calculate remaining prompt tokens when request is in prefill phase
+            remaining_prompt_tokens = request.num_tokens - request.num_computed_tokens - num_new_tokens
+            if num_new_tokens > prefill_schedul_threshold:
+                # Boundary condition: when remaining tokens equal or less than threshold,
+                # reduce current round's token count to prevent phase misclassification
+                # in reorder batch later in next round
+                if 0 < remaining_prompt_tokens <= prefill_schedul_threshold:
+                    num_new_tokens -= (prefill_schedul_threshold -
+                                       remaining_prompt_tokens + 1)
+                    num_new_tokens = 0 if num_new_tokens < prefill_schedul_threshold else num_new_tokens
+            elif remaining_prompt_tokens > 0:
+                # cannot schedule less than threshold tokens in chunked prefill
+                num_new_tokens = 0
+
+        if num_new_tokens == 0:
+            # The request cannot be scheduled because one of the following
+            # reasons:
+            # 1. No new tokens to schedule. This may happen when
+            #    (1) PP>1 and we have already scheduled all prompt tokens
+            #    but they are not finished yet.
+            #    (2) Async scheduling and the request has reached to either
+            #    its max_total_tokens or max_model_len.
+            # 2. The encoder budget is exhausted.
+            # 3. The encoder cache is exhausted.
+            # NOTE(woosuk): Here, by doing `continue` instead of `break`,
+            # we do not strictly follow the FCFS scheduling policy and
+            # allow the lower-priority requests to be scheduled.
+            req_index += 1
+            continue
+
+        while True:
+            new_blocks = self.kv_cache_manager.allocate_slots(
+                request,
+                num_new_tokens,
+                num_lookahead_tokens=self.num_lookahead_tokens)
+            if new_blocks is None:
+                # The request cannot be scheduled.
+                # Preempt the lowest-priority request.
+                if self.policy == SchedulingPolicy.PRIORITY:
+                    preempted_req = max(
+                        self.running,
+                        key=lambda r: (r.priority, r.arrival_time),
+                    )
+                    self.running.remove(preempted_req)
+                    if preempted_req in scheduled_running_reqs:
+                        scheduled_running_reqs.remove(preempted_req)
+                else:
+                    preempted_req = self.running.pop()
+
+                self.kv_cache_manager.free(preempted_req)
+                self.encoder_cache_manager.free(preempted_req)
+                preempted_req.status = RequestStatus.PREEMPTED
+                preempted_req.num_computed_tokens = 0
+                if self.log_stats:
+                    preempted_req.record_event(EngineCoreEventType.PREEMPTED,
+                                               scheduled_timestamp)
+
+                self.waiting.prepend_request(preempted_req)
+                preempted_reqs.append(preempted_req)
+                if preempted_req == request:
+                    # No more request to preempt.
+                    can_schedule = False
+                    break
+            else:
+                # The request can be scheduled.
+                can_schedule = True
+                break
+        if not can_schedule:
+            break
+        assert new_blocks is not None
+
+        # Schedule the request.
+        scheduled_running_reqs.append(request)
+        req_to_new_blocks[request.request_id] = new_blocks
+        num_scheduled_tokens[request.request_id] = num_new_tokens
+        token_budget -= num_new_tokens
+        req_index += 1
+
+        # Speculative decode related.
+        if request.spec_token_ids:
+            num_scheduled_spec_tokens = (num_new_tokens +
+                                         request.num_computed_tokens -
+                                         request.num_tokens)
+            if num_scheduled_spec_tokens > 0:
+                # Trim spec_token_ids list to num_scheduled_spec_tokens.
+                del request.spec_token_ids[num_scheduled_spec_tokens:]
+                scheduled_spec_decode_tokens[request.request_id] = (
+                    request.spec_token_ids)
+
+        # Encoder-related.
+        if encoder_inputs_to_schedule:
+            scheduled_encoder_inputs[request.request_id] = (
+                encoder_inputs_to_schedule)
+            # Allocate the encoder cache.
+            for i in encoder_inputs_to_schedule:
+                self.encoder_cache_manager.allocate(request, i)
+            encoder_compute_budget = new_encoder_compute_budget
+
+    # Record the LoRAs in scheduled_running_reqs
+    scheduled_loras: set[int] = set()
+    if self.lora_config:
+        scheduled_loras = set(
+            req.lora_request.lora_int_id for req in scheduled_running_reqs
+            if req.lora_request and req.lora_request.lora_int_id > 0)
+        assert len(scheduled_loras) <= self.lora_config.max_loras
+
+    # Use a temporary RequestQueue to collect requests that need to be
+    # skipped and put back at the head of the waiting queue later
+    skipped_waiting_requests = create_request_queue(self.policy)
+
+    # Next, schedule the WAITING requests.
+    if not preempted_reqs:
+        while self.waiting and token_budget > 0:
+            if len(self.running) == self.max_num_running_reqs:
+                break
+
+            request = self.waiting.peek_request()
+
+            # KVTransfer: skip request if still waiting for remote kvs.
+            if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                is_ready = self._update_waiting_for_remote_kv(request)
+                if is_ready:
+                    request.status = RequestStatus.WAITING
+                else:
+                    logger.debug(
+                        "%s is still in WAITING_FOR_REMOTE_KVS state.",
+                        request.request_id)
+                    self.waiting.pop_request()
+                    skipped_waiting_requests.prepend_request(request)
+                    continue
+
+            # Skip request if the structured output request is still waiting
+            # for FSM compilation.
+            if request.status == RequestStatus.WAITING_FOR_FSM:
+                structured_output_req = request.structured_output_request
+                if structured_output_req and structured_output_req.grammar:
+                    request.status = RequestStatus.WAITING
+                else:
+                    self.waiting.pop_request()
+                    skipped_waiting_requests.prepend_request(request)
+                    continue
+
+            # Check that adding the request still respects the max_loras
+            # constraint.
+            if (self.lora_config and request.lora_request and
+                (len(scheduled_loras) == self.lora_config.max_loras
+                 and request.lora_request.lora_int_id not in scheduled_loras)):
+                # Scheduling would exceed max_loras, skip.
+                self.waiting.pop_request()
+                skipped_waiting_requests.prepend_request(request)
+                continue
+
+            num_external_computed_tokens = 0
+            load_kv_async = False
+
+            # Get already-cached tokens.
+            if request.num_computed_tokens == 0:
+                # Get locally-cached tokens.
+                new_computed_blocks, num_new_local_computed_tokens = \
+                    self.kv_cache_manager.get_computed_blocks(
+                        request)
+
+                # Get externally-cached tokens if using a KVConnector.
+                if self.connector is not None:
+                    num_external_computed_tokens, load_kv_async = (
+                        self.connector.get_num_new_matched_tokens(
+                            request, num_new_local_computed_tokens))
+
+                    if num_external_computed_tokens is None:
+                        # The request cannot be scheduled because
+                        # the KVConnector couldn't determine
+                        # the number of matched tokens.
+                        self.waiting.pop_request()
+                        skipped_waiting_requests.prepend_request(request)
+                        continue
+
+                # Total computed tokens (local + external).
+                num_computed_tokens = (num_new_local_computed_tokens +
+                                       num_external_computed_tokens)
+            # KVTransfer: WAITING reqs have num_computed_tokens > 0
+            # after async KV recvs are completed.
+            else:
+                new_computed_blocks = (
+                    self.kv_cache_manager.create_empty_block_list())
+                num_new_local_computed_tokens = 0
+                num_computed_tokens = request.num_computed_tokens
+
+            encoder_inputs_to_schedule = None
+            new_encoder_compute_budget = encoder_compute_budget
+
+            # KVTransfer: loading remote KV, do not allocate for new work.
+            if load_kv_async:
+                assert num_external_computed_tokens > 0
+                num_new_tokens = 0
+            # Number of tokens to be scheduled.
+            else:
+                # We use `request.num_tokens` instead of
+                # `request.num_prompt_tokens` to consider the resumed
+                # requests, which have output tokens.
+                num_new_tokens = request.num_tokens - num_computed_tokens
+                if (0 < self.scheduler_config.long_prefill_token_threshold <
+                        num_new_tokens):
+                    num_new_tokens = (
+                        self.scheduler_config.long_prefill_token_threshold)
+
+                # chunked prefill has to be enabled explicitly to allow
+                # pooling requests to be chunked
+                if not self.scheduler_config.chunked_prefill_enabled and \
+                    num_new_tokens > token_budget:
+                    self.waiting.pop_request()
+                    skipped_waiting_requests.prepend_request(request)
+                    continue
+
+                num_new_tokens = min(num_new_tokens, token_budget)
+                assert num_new_tokens > 0
+
+                # Schedule encoder inputs.
+                if request.has_encoder_inputs:
+                    (encoder_inputs_to_schedule, num_new_tokens,
+                     new_encoder_compute_budget
+                     ) = self._try_schedule_encoder_inputs(
+                         request, num_computed_tokens, num_new_tokens,
+                         encoder_compute_budget)
+                    if num_new_tokens == 0:
+                        # The request cannot be scheduled.
+                        break
+
+                if num_new_tokens <= self.num_spec_tokens + 1:
+                    # Too short waiting requests can not be scheduled.
+                    self.waiting.pop_request()
+                    skipped_waiting_requests.prepend_request(request)
+                    continue
+
+            # Handles an edge case when P/D Disaggregation
+            # is used with Spec Decoding where an
+            # extra block gets allocated which
+            # creates a mismatch between the number
+            # of local and remote blocks.
+            effective_lookahead_tokens = (0 if request.num_computed_tokens == 0
+                                          else self.num_lookahead_tokens)
+
+            # Determine if we need to allocate cross-attention blocks.
+            if self.is_encoder_decoder and request.has_encoder_inputs:
+                # TODO(russellb): For Whisper, we know that the input is
+                # always padded to the maximum length. If we support other
+                # encoder-decoder models, this will need to be updated if we
+                # want to only allocate what is needed.
+                num_encoder_tokens =\
+                    self.scheduler_config.max_num_encoder_input_tokens
+            else:
+                num_encoder_tokens = 0
+
+            new_blocks = self.kv_cache_manager.allocate_slots(
+                request,
+                num_new_tokens + num_external_computed_tokens,
+                num_new_local_computed_tokens,
+                new_computed_blocks,
+                num_lookahead_tokens=effective_lookahead_tokens,
+                delay_cache_blocks=load_kv_async,
+                num_encoder_tokens=num_encoder_tokens,
+            )
+
+            if new_blocks is None:
+                # The request cannot be scheduled.
+                break
+
+            # KVTransfer: the connector uses this info to determine
+            # if a load is needed. Note that
+            # This information is used to determine if a load is
+            # needed for this request.
+            if self.connector is not None:
+                self.connector.update_state_after_alloc(
+                    request,
+                    new_computed_blocks + new_blocks,
+                    num_external_computed_tokens,
+                )
+
+            # Request was already popped from self.waiting
+            # unless it was re-added above due to new_blocks being None.
+            request = self.waiting.pop_request()
+            if load_kv_async:
+                # If loading async, allocate memory and put request
+                # into the WAITING_FOR_REMOTE_KV state.
+                skipped_waiting_requests.prepend_request(request)
+                request.status = RequestStatus.WAITING_FOR_REMOTE_KVS
+                continue
+
+            req_index += 1
+            self.running.append(request)
+            if self.log_stats:
+                request.record_event(EngineCoreEventType.SCHEDULED,
+                                     scheduled_timestamp)
+            if request.status == RequestStatus.WAITING:
+                scheduled_new_reqs.append(request)
+            elif request.status == RequestStatus.PREEMPTED:
+                scheduled_resumed_reqs.append(request)
+            else:
+                raise RuntimeError(f"Invalid request status: {request.status}")
+
+            if self.lora_config and request.lora_request:
+                scheduled_loras.add(request.lora_request.lora_int_id)
+            req_to_new_blocks[request.request_id] = (
+                self.kv_cache_manager.get_blocks(request.request_id))
+            num_scheduled_tokens[request.request_id] = num_new_tokens
+            token_budget -= num_new_tokens
+            request.status = RequestStatus.RUNNING
+            request.num_computed_tokens = num_computed_tokens
+            # Count the number of prefix cached tokens.
+            if request.num_cached_tokens < 0:
+                request.num_cached_tokens = num_computed_tokens
+            # Encoder-related.
+            if encoder_inputs_to_schedule:
+                scheduled_encoder_inputs[request.request_id] = (
+                    encoder_inputs_to_schedule)
+                # Allocate the encoder cache.
+                for i in encoder_inputs_to_schedule:
+                    self.encoder_cache_manager.allocate(request, i)
+                encoder_compute_budget = new_encoder_compute_budget
+
+    # Put back any skipped requests at the head of the waiting queue
+    if skipped_waiting_requests:
+        self.waiting.prepend_requests(skipped_waiting_requests)
+
+    # Check if the scheduling constraints are satisfied.
+    total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
+    assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
+    assert token_budget >= 0
+    assert len(self.running) <= self.max_num_running_reqs
+    # Since some requests in the RUNNING queue may not be scheduled in
+    # this step, the total number of scheduled requests can be smaller than
+    # len(self.running).
+    assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) +
+            len(scheduled_running_reqs) <= len(self.running))
+
+    # Get the longest common prefix among all requests in the running queue.
+    # This can be potentially used for cascade attention.
+    num_common_prefix_blocks = [0] * len(self.kv_cache_config.kv_cache_groups)
+    if self.running:
+        any_request = self.running[0]
+        num_common_prefix_blocks = (
+            self.kv_cache_manager.get_num_common_prefix_blocks(
+                any_request, len(self.running)))
+
+    # Construct the scheduler output.
+    new_reqs_data = [
+        NewRequestData.from_request(
+            req, req_to_new_blocks[req.request_id].get_block_ids())
+        for req in scheduled_new_reqs
+    ]
+    cached_reqs_data = self._make_cached_request_data(
+        scheduled_running_reqs,
+        scheduled_resumed_reqs,
+        num_scheduled_tokens,
+        scheduled_spec_decode_tokens,
+        req_to_new_blocks,
+    )
+    scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs +
+                          scheduled_resumed_reqs)
+    structured_output_request_ids, grammar_bitmask = (self.get_grammar_bitmask(
+        scheduled_requests, scheduled_spec_decode_tokens))
+    scheduler_output = SchedulerOutput(
+        scheduled_new_reqs=new_reqs_data,
+        scheduled_cached_reqs=cached_reqs_data,
+        num_scheduled_tokens=num_scheduled_tokens,
+        total_num_scheduled_tokens=total_num_scheduled_tokens,
+        scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
+        scheduled_encoder_inputs=scheduled_encoder_inputs,
+        num_common_prefix_blocks=num_common_prefix_blocks,
+        # finished_req_ids is an existing state in the scheduler,
+        # instead of being newly scheduled in this step.
+        # It contains the request IDs that are finished in between
+        # the previous and the current steps.
+        finished_req_ids=self.finished_req_ids,
+        free_encoder_mm_hashes=self.encoder_cache_manager.get_freed_mm_hashes(
+        ),
+        structured_output_request_ids=structured_output_request_ids,
+        grammar_bitmask=grammar_bitmask,
+    )
+
+    # NOTE(Kuntai): this function is designed for multiple purposes:
+    # 1. Plan the KV cache store
+    # 2. Wrap up all the KV cache load / save ops into an opaque object
+    # 3. Clear the internal states of the connector
+    if self.connector is not None:
+        meta = self.connector.build_connector_meta(scheduler_output)
+        scheduler_output.kv_connector_metadata = meta
+
+    # collect KV cache events from KV cache manager
+    events = self.kv_cache_manager.take_events()
+
+    # collect KV cache events from connector
+    if self.connector is not None:
+        connector_events = self.connector.take_events()
+        if connector_events:
+            if events is None:
+                events = list(connector_events)
+            else:
+                events.extend(connector_events)
+
+    # publish collected KV cache events
+    if events:
+        batch = KVEventBatch(ts=time.time(), events=events)
+        self.kv_event_publisher.publish(batch)
+
+    self._update_after_schedule(scheduler_output)
+    return scheduler_output
+
+
+@patch_to(Scheduler)
+def _make_cached_request_data(
+    self,
+    running_reqs: list[Request],
+    resumed_reqs: list[Request],
+    num_scheduled_tokens: dict[str, int],
+    spec_decode_tokens: dict[str, list[int]],
+    req_to_new_blocks: dict[str, KVCacheBlocks],
+) -> CachedRequestData:
+    req_ids: list[str] = []
+    new_token_ids: list[list[int]] = []
+    new_block_ids: list[Optional[tuple[list[int], ...]]] = []
+    num_computed_tokens: list[int] = []
+
+    use_connector = self.connector is not None
+    for req in itertools.chain(running_reqs, resumed_reqs):
+        req_id = req.request_id
+        req_ids.append(req_id)
+        num_tokens = (num_scheduled_tokens[req_id] -
+                      len(spec_decode_tokens.get(req_id, ())))
+        # if self.use_pp:
+        if not use_connector:
+            # When using PP, the scheduler sends the sampled tokens back,
+            # because there's no direct communication between the first-
+            # stage worker and the last-stage worker. Otherwise, we don't
+            # need to send the sampled tokens back because the model runner
+            # will cache them.
+            token_ids = req.all_token_ids[req.num_computed_tokens:req.
+                                          num_computed_tokens + num_tokens]
+            new_token_ids.append(token_ids)
+        elif use_connector:
+            # When using a KVConnector, we add a placeholder to avoid index
+            # out of bounds errors. TODO: Remove this once the KVConnector
+            # is updated to handle token IDs properly.
+            new_token_ids.append([])
+        new_block_ids.append(
+            req_to_new_blocks[req_id].get_block_ids(allow_none=True))
+        num_computed_tokens.append(req.num_computed_tokens)
+    # Because resumed_reqs is usually empty, it is more efficient to do
+    # in-place appending so that we don't need to allocate a new list.
+    resumed_from_preemption = [False] * len(running_reqs)
+    resumed_from_preemption += [True] * len(resumed_reqs)
+
+    return CachedRequestData(
+        req_ids=req_ids,
+        resumed_from_preemption=resumed_from_preemption,
+        new_token_ids=new_token_ids,
+        new_block_ids=new_block_ids,
+        num_computed_tokens=num_computed_tokens,
+    )