Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logger import init_logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.request import Request, RequestStatus
logger = init_logger(__name__)
class AsyncScheduler(Scheduler):
def _update_after_schedule(
self,
scheduler_output: SchedulerOutput,
) -> None:
super()._update_after_schedule(scheduler_output)
pending_structured_output_tokens = False
spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
for req_id in scheduler_output.num_scheduled_tokens:
request = self.requests[req_id]
pending_structured_output_tokens |= (
request.use_structured_output and request.num_output_placeholders > 0
)
cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, ()))
if (
request.num_computed_tokens
== request.num_tokens
+ request.num_output_placeholders
+ cur_num_spec_tokens
):
# The request will generate a new token plus num_spec_tokens
# in this scheduling step.
request.num_output_placeholders += 1 + cur_num_spec_tokens
# Add placeholders for the new tokens in spec_token_ids.
# We will update the actual spec token ids in the worker process.
request.spec_token_ids = [-1] * self.num_spec_tokens
scheduler_output.pending_structured_output_tokens = (
pending_structured_output_tokens
)
def _update_request_with_output(
self,
request: Request,
new_token_ids: list[int],
) -> tuple[list[int], bool]:
if request.discard_latest_async_tokens:
# If the request is force preempted in reset_prefix_cache, we
# should discard the latest async token.
request.discard_latest_async_tokens = False
return [], False
status_before_update = request.status
new_token_ids, stopped = super()._update_request_with_output(
request, new_token_ids
)
# Update the number of output placeholders.
request.num_output_placeholders -= len(new_token_ids)
assert request.num_output_placeholders >= 0
# Cache the new tokens. Preempted requests should be skipped.
if status_before_update == RequestStatus.RUNNING:
self.kv_cache_manager.cache_blocks(
request, request.num_computed_tokens - request.num_output_placeholders
)
return new_token_ids, stopped

View File

@@ -0,0 +1,189 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import TYPE_CHECKING, Optional
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
if TYPE_CHECKING:
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import EngineCoreOutputs
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.metrics.stats import SchedulerStats
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
class SchedulerInterface(ABC):
@abstractmethod
def __init__(
self,
vllm_config: "VllmConfig",
kv_cache_config: "KVCacheConfig",
structured_output_manager: "StructuredOutputManager",
block_size: int,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
include_finished_set: bool = False,
log_stats: bool = False,
) -> None:
raise NotImplementedError
@abstractmethod
def schedule(self) -> "SchedulerOutput":
"""Schedule the requests to process in this scheduling step.
The scheduling decision is made at the iteration level. Each scheduling
step corresponds to a single forward pass of the model. Therefore, this
method is called repeatedly by a busy loop in the engine.
Essentially, the scheduler produces a dictionary of {req_id: num_tokens}
that specifies how many tokens to process for each request in this
scheduling step. For example, num_tokens can be as large as the number
of prompt tokens for new requests, or it can be 1 for the requests that
are auto-regressively generating new tokens one by one. Otherwise, it
can be somewhere in between in case of chunked prefills, prefix caching,
speculative decoding, etc.
Additionally, the scheduler also returns useful data about each request
or the batch as a whole. The model runner will use this information in
preparing inputs to the model.
Returns:
A SchedulerOutput object containing information about the scheduled
requests.
"""
raise NotImplementedError
@abstractmethod
def get_grammar_bitmask(
self, scheduler_output: "SchedulerOutput"
) -> "GrammarOutput | None":
raise NotImplementedError
@abstractmethod
def update_from_output(
self,
scheduler_output: "SchedulerOutput",
model_runner_output: "ModelRunnerOutput",
) -> dict[int, "EngineCoreOutputs"]:
"""Update the scheduler state based on the model runner output.
This method is called after the model runner has processed the scheduled
requests. The model runner output includes generated token ids, draft
token ids for next step, etc. The scheduler uses this information to
update its states, checks the finished requests, and returns the output
for each request.
Returns:
A dict of client index to EngineCoreOutputs object containing the
outputs for each request originating from that client.
"""
raise NotImplementedError
@abstractmethod
def update_draft_token_ids(
self,
draft_token_ids: "DraftTokenIds",
) -> None:
"""Update the draft token ids for the scheduled requests."""
raise NotImplementedError
@abstractmethod
def add_request(self, request: "Request") -> None:
"""Add a new request to the scheduler's internal queue.
Args:
request: The new request being added.
"""
raise NotImplementedError
@abstractmethod
def finish_requests(
self,
request_ids: str | Iterable[str],
finished_status: "RequestStatus",
) -> None:
"""Finish the requests in the scheduler's internal queue. If the request
is not in the queue, this method will do nothing.
This method is called in two cases:
1. When the request is aborted by the client.
2. When the frontend process detects a stop string of the request after
de-tokenizing its generated tokens.
Args:
request_ids: A single or a list of request IDs.
finished_status: The finished status of the given requests.
"""
raise NotImplementedError
@abstractmethod
def get_num_unfinished_requests(self) -> int:
"""Number of unfinished requests in the scheduler's internal queue."""
raise NotImplementedError
def has_unfinished_requests(self) -> bool:
"""Returns True if there are unfinished requests in the scheduler's
internal queue."""
return self.get_num_unfinished_requests() > 0
@abstractmethod
def has_finished_requests(self) -> bool:
"""Returns True if there are finished requests that need to be cleared.
NOTE: This is different from `not self.has_unfinished_requests()`.
The scheduler maintains an internal list of the requests finished in the
previous step. This list is returned from the next call to schedule(),
to be sent to the model runner in the next step to clear cached states
for these finished requests.
This method checks if this internal list of finished requests is
non-empty. This information is useful for DP attention.
"""
raise NotImplementedError
def has_requests(self) -> bool:
"""Returns True if there are unfinished requests, or finished requests
not yet returned in SchedulerOutputs."""
return self.has_unfinished_requests() or self.has_finished_requests()
@abstractmethod
def reset_prefix_cache(
self, reset_running_requests: bool = False, reset_connector: bool = False
) -> bool:
"""Reset the prefix cache for KV cache.
This is particularly required when the model weights are live-updated.
Args:
reset_running_requests: If True, all the running requests will be
preempted and moved to the waiting queue. Otherwise, this method
will only reset the KV prefix cache when there is no running request
taking KV cache.
"""
raise NotImplementedError
@abstractmethod
def get_request_counts(self) -> tuple[int, int]:
"""Returns (num_running_reqs, num_waiting_reqs)."""
raise NotImplementedError
@abstractmethod
def make_stats(self) -> Optional["SchedulerStats"]:
"""Make a SchedulerStats object for logging.
The SchedulerStats object is created for every scheduling step.
"""
raise NotImplementedError
@abstractmethod
def shutdown(self) -> None:
"""Shutdown the scheduler."""
raise NotImplementedError
def get_kv_connector(self) -> Optional["KVConnectorBase_V1"]:
return None

View File

@@ -0,0 +1,230 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from functools import cached_property
from typing import TYPE_CHECKING
from typing_extensions import deprecated
from vllm._bc_linter import bc_linter_include
if TYPE_CHECKING:
import numpy as np
import numpy.typing as npt
import torch
from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.v1.request import Request
else:
ECConnectorMetadata = object
KVConnectorMetadata = object
LoRARequest = object
MultiModalFeatureSpec = object
PoolingParams = object
SamplingParams = object
Request = object
@bc_linter_include
@dataclass
class NewRequestData:
req_id: str
prompt_token_ids: list[int] | None
mm_features: list[MultiModalFeatureSpec]
sampling_params: SamplingParams | None
pooling_params: PoolingParams | None
block_ids: tuple[list[int], ...]
num_computed_tokens: int
lora_request: LoRARequest | None
prompt_embeds: "torch.Tensor | None" = None
# Only used for v2 model runner.
prefill_token_ids: list[int] | None = None
@classmethod
def from_request(
cls,
request: Request,
block_ids: tuple[list[int], ...],
prefill_token_ids: list[int] | None = None,
) -> "NewRequestData":
return cls(
req_id=request.request_id,
prompt_token_ids=request.prompt_token_ids,
mm_features=request.mm_features,
sampling_params=request.sampling_params,
pooling_params=request.pooling_params,
block_ids=block_ids,
num_computed_tokens=request.num_computed_tokens,
lora_request=request.lora_request,
prompt_embeds=request.prompt_embeds,
prefill_token_ids=prefill_token_ids,
)
def __repr__(self) -> str:
prompt_embeds_shape = (
self.prompt_embeds.shape if self.prompt_embeds is not None else None
)
return (
f"NewRequestData("
f"req_id={self.req_id},"
f"prompt_token_ids={self.prompt_token_ids},"
f"prefill_token_ids={self.prefill_token_ids},"
f"mm_features={self.mm_features},"
f"sampling_params={self.sampling_params},"
f"block_ids={self.block_ids},"
f"num_computed_tokens={self.num_computed_tokens},"
f"lora_request={self.lora_request},"
f"prompt_embeds_shape={prompt_embeds_shape}"
")"
)
# Version of __repr__ with the prompt data obfuscated
def anon_repr(self) -> str:
prompt_token_ids_len = (
len(self.prompt_token_ids) if self.prompt_token_ids is not None else None
)
prompt_embeds_shape = (
self.prompt_embeds.shape if self.prompt_embeds is not None else None
)
return (
f"NewRequestData("
f"req_id={self.req_id},"
f"prompt_token_ids_len={prompt_token_ids_len},"
f"mm_features={self.mm_features},"
f"sampling_params={self.sampling_params},"
f"block_ids={self.block_ids},"
f"num_computed_tokens={self.num_computed_tokens},"
f"lora_request={self.lora_request},"
f"prompt_embeds_shape={prompt_embeds_shape}"
")"
)
@bc_linter_include
@dataclass
class CachedRequestData:
req_ids: list[str]
# For request ids not in resumed_req_ids, new_block_ids will be appended to
# the request's block IDs. For those in the set, new_block_ids will be used as the
# request's block IDs instead of appending to the existing block IDs.
resumed_req_ids: set[str]
# NOTE(woosuk): new_token_ids is only used for pipeline parallelism.
# When PP is not used, new_token_ids will be empty.
new_token_ids: list[list[int]]
# For requests not scheduled in the last step, propagate the token ids to the
# connector. Won't contain requests that were scheduled in the prior step.
all_token_ids: dict[str, list[int]]
new_block_ids: list[tuple[list[int], ...] | None]
num_computed_tokens: list[int]
num_output_tokens: list[int]
@property
def num_reqs(self) -> int:
return len(self.req_ids)
@cached_property
@deprecated("This will be removed in v0.14, use `resumed_req_ids` instead.")
def resumed_from_preemption(self) -> list[bool]:
return [req_id in self.resumed_req_ids for req_id in self.req_ids]
@cached_property
@deprecated("This will be removed in v0.14, use `all_token_ids` instead.")
def resumed_req_token_ids(self) -> list[list[int] | None]:
return [
self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None
for req_id in self.req_ids
]
@classmethod
def make_empty(cls) -> "CachedRequestData":
return cls(
req_ids=[],
resumed_req_ids=set(),
new_token_ids=[],
all_token_ids={},
new_block_ids=[],
num_computed_tokens=[],
num_output_tokens=[],
)
@bc_linter_include
@dataclass
class SchedulerOutput:
# list of the requests that are scheduled for the first time.
# We cache the request's data in each worker process, so that we don't
# need to re-send it every scheduling step.
scheduled_new_reqs: list[NewRequestData]
# list of the requests that have been scheduled before.
# Since the request's data is already cached in the worker processes,
# we only send the diff to minimize the communication cost.
scheduled_cached_reqs: CachedRequestData
# req_id -> num_scheduled_tokens
# Number of tokens scheduled for each request.
num_scheduled_tokens: dict[str, int]
# Total number of tokens scheduled for all requests.
# Equal to sum(num_scheduled_tokens.values())
total_num_scheduled_tokens: int
# req_id -> spec_token_ids
# If a request does not have any spec decode tokens, it will not be
# included in the dictionary.
scheduled_spec_decode_tokens: dict[str, list[int]]
# req_id -> encoder input indices that need processing.
# E.g., if a request has [0, 1], it could mean the vision encoder needs
# to process that the request's 0-th and 1-th images in the current step.
scheduled_encoder_inputs: dict[str, list[int]]
# Number of common prefix blocks for all requests in each KV cache group.
# This can be used for cascade attention.
num_common_prefix_blocks: list[int]
# Request IDs that are finished in between the previous and the current
# steps. This is used to notify the workers about the finished requests
# so that they can free the cached states for those requests.
finished_req_ids: set[str]
# list of mm_hash strings associated with the encoder outputs to be
# freed from the encoder cache.
free_encoder_mm_hashes: list[str]
# Request IDs that are preempted in this step.
# Only used for v2 model runner.
preempted_req_ids: set[str] | None = None
# Whether the scheduled requests have all the output tokens they
# need to perform grammar bitmask computation.
pending_structured_output_tokens: bool = False
# KV Cache Connector metadata.
kv_connector_metadata: KVConnectorMetadata | None = None
# EC Cache Connector metadata
ec_connector_metadata: ECConnectorMetadata | None = None
@classmethod
def make_empty(cls) -> "SchedulerOutput":
return cls(
scheduled_new_reqs=[],
scheduled_cached_reqs=CachedRequestData.make_empty(),
num_scheduled_tokens={},
total_num_scheduled_tokens=0,
scheduled_spec_decode_tokens={},
scheduled_encoder_inputs={},
num_common_prefix_blocks=[],
finished_req_ids=set(),
free_encoder_mm_hashes=[],
)
@dataclass
class GrammarOutput:
# ids of structured output requests.
structured_output_request_ids: list[str]
# Bitmask ordered as structured_output_request_ids.
grammar_bitmask: "npt.NDArray[np.int32]"

View File

@@ -0,0 +1,217 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import heapq
from abc import ABC, abstractmethod
from collections import deque
from collections.abc import Iterable, Iterator
from enum import Enum
from vllm.v1.request import Request
class SchedulingPolicy(Enum):
"""Enum for scheduling policies."""
FCFS = "fcfs"
PRIORITY = "priority"
class RequestQueue(ABC):
"""Abstract base class for request queues."""
@abstractmethod
def add_request(self, request: Request) -> None:
"""Add a request to the queue according to the policy."""
pass
@abstractmethod
def pop_request(self) -> Request:
"""Pop a request from the queue according to the policy."""
pass
@abstractmethod
def peek_request(self) -> Request:
"""Peek at the request at the front of the queue without removing it."""
pass
@abstractmethod
def prepend_request(self, request: Request) -> None:
"""Prepend a request to the front of the queue."""
pass
@abstractmethod
def prepend_requests(self, requests: "RequestQueue") -> None:
"""Prepend all requests from another queue to the front of this
queue."""
pass
@abstractmethod
def remove_request(self, request: Request) -> None:
"""Remove a specific request from the queue."""
pass
@abstractmethod
def remove_requests(self, requests: Iterable[Request]) -> None:
"""Remove multiple specific requests from the queue."""
pass
@abstractmethod
def __bool__(self) -> bool:
"""Check if queue has any requests."""
pass
@abstractmethod
def __len__(self) -> int:
"""Get number of requests in queue."""
pass
@abstractmethod
def __iter__(self) -> Iterator[Request]:
"""Iterate over the queue according to the policy."""
pass
@abstractmethod
def __reversed__(self) -> Iterator[Request]:
"""Iterate over the queue in reverse order."""
pass
class FCFSRequestQueue(deque[Request], RequestQueue):
"""A first-come-first-served queue that supports deque operations."""
def add_request(self, request: Request) -> None:
"""Add a request to the queue according to FCFS policy."""
self.append(request)
def pop_request(self) -> Request:
"""Pop a request from the queue according to FCFS policy."""
return self.popleft()
def peek_request(self) -> Request:
"""Peek at the next request in the queue without removing it."""
if not self:
raise IndexError("peek from an empty queue")
return self[0]
def prepend_request(self, request: Request) -> None:
"""Prepend a request to the front of the queue."""
self.appendleft(request)
def prepend_requests(self, requests: RequestQueue) -> None:
"""Prepend all requests from another queue to the front of this
queue."""
self.extendleft(reversed(requests))
def remove_request(self, request: Request) -> None:
"""Remove a specific request from the queue."""
self.remove(request)
def remove_requests(self, requests: Iterable[Request]) -> None:
"""Remove multiple specific requests from the queue."""
requests_to_remove = set(requests)
filtered_requests = [req for req in self if req not in requests_to_remove]
# deque does not support in-place filtering, so we need to clear
# and extend
self.clear()
self.extend(filtered_requests)
def __bool__(self) -> bool:
"""Check if queue has any requests."""
return len(self) > 0
def __len__(self) -> int:
"""Get number of requests in queue."""
return super().__len__()
def __iter__(self) -> Iterator[Request]:
"""Iterate over the queue according to FCFS policy."""
return super().__iter__()
def __reversed__(self) -> Iterator[Request]:
"""Iterate over the queue in reverse order."""
return super().__reversed__()
class PriorityRequestQueue(RequestQueue):
"""
A priority queue that supports heap operations.
Respects the ordering defined in the Request class, where
requests with a smaller value of `priority` are processed first.
If multiple requests have the same priority, the one with the earlier
`arrival_time` is processed first.
"""
def __init__(self) -> None:
self._heap: list[Request] = []
def add_request(self, request: Request) -> None:
"""Add a request to the queue according to priority policy."""
heapq.heappush(self._heap, request)
def pop_request(self) -> Request:
"""Pop a request from the queue according to priority policy."""
if not self._heap:
raise IndexError("pop from empty heap")
return heapq.heappop(self._heap)
def peek_request(self) -> Request:
"""Peek at the next request in the queue without removing it."""
if not self._heap:
raise IndexError("peek from empty heap")
return self._heap[0]
def prepend_request(self, request: Request) -> None:
"""Add a request to the queue according to priority policy.
Note: In a priority queue, there is no concept of prepending to the
front. Requests are ordered by (priority, arrival_time)."""
self.add_request(request)
def prepend_requests(self, requests: RequestQueue) -> None:
"""Add all requests from another queue according to priority policy.
Note: In a priority queue, there is no concept of prepending to the
front. Requests are ordered by (priority, arrival_time)."""
for request in requests:
self.add_request(request)
def remove_request(self, request: Request) -> None:
"""Remove a specific request from the queue."""
self._heap.remove(request)
heapq.heapify(self._heap)
def remove_requests(self, requests: Iterable[Request]) -> None:
"""Remove multiple specific requests from the queue."""
requests_to_remove = requests if isinstance(requests, set) else set(requests)
self._heap = [r for r in self._heap if r not in requests_to_remove]
heapq.heapify(self._heap)
def __bool__(self) -> bool:
"""Check if queue has any requests."""
return bool(self._heap)
def __len__(self) -> int:
"""Get number of requests in queue."""
return len(self._heap)
def __iter__(self) -> Iterator[Request]:
"""Iterate over the queue according to priority policy."""
heap_copy = self._heap[:]
while heap_copy:
yield heapq.heappop(heap_copy)
def __reversed__(self) -> Iterator[Request]:
"""Iterate over the queue in reverse priority order."""
return reversed(list(self))
def create_request_queue(policy: SchedulingPolicy) -> RequestQueue:
"""Create request queue based on scheduling policy."""
if policy == SchedulingPolicy.PRIORITY:
return PriorityRequestQueue()
elif policy == SchedulingPolicy.FCFS:
return FCFSRequestQueue()
else:
raise ValueError(f"Unknown scheduling policy: {policy}")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
from vllm.v1.request import Request, RequestStatus
def remove_all(lst: list, items_to_remove: set) -> list:
"""Remove all items from a list that are in the items_to_remove set.
This method optimizes for the common case of removing a single item,
falling back to list comprehension for multiple items.
Args:
lst: The list to remove items from
items_to_remove: Set of items to remove
Returns:
Either the modified original list (for single item removal) or
a new list (for multiple item removal). Callers should use the
returned value.
Note:
For single item removal, this modifies the original list in-place
and returns it. For multiple items, it creates and returns a new list.
"""
if not items_to_remove:
return lst
if len(items_to_remove) == 1:
# Fast path for single item removal (most common case)
item = next(iter(items_to_remove))
with contextlib.suppress(ValueError):
lst.remove(item)
return lst
# For multiple items, use list comprehension
return [item for item in lst if item not in items_to_remove]
def check_stop(request: Request, max_model_len: int) -> bool:
assert not request.pooling_params
sampling_params = request.sampling_params
assert sampling_params is not None
if request.num_output_tokens < sampling_params.min_tokens:
return False
last_token_id = request.output_token_ids[-1]
if not sampling_params.ignore_eos and last_token_id == request.eos_token_id:
request.status = RequestStatus.FINISHED_STOPPED
return True
if last_token_id in (sampling_params.stop_token_ids or ()):
request.status = RequestStatus.FINISHED_STOPPED
request.stop_reason = last_token_id
return True
if (
request.num_tokens >= max_model_len
or request.num_output_tokens >= request.max_tokens
):
request.status = RequestStatus.FINISHED_LENGTH_CAPPED
return True
return False