Sync from v0.13
This commit is contained in:
0
vllm/v1/core/sched/__init__.py
Normal file
0
vllm/v1/core/sched/__init__.py
Normal file
68
vllm/v1/core/sched/async_scheduler.py
Normal file
68
vllm/v1/core/sched/async_scheduler.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class AsyncScheduler(Scheduler):
|
||||
def _update_after_schedule(
|
||||
self,
|
||||
scheduler_output: SchedulerOutput,
|
||||
) -> None:
|
||||
super()._update_after_schedule(scheduler_output)
|
||||
pending_structured_output_tokens = False
|
||||
spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
|
||||
for req_id in scheduler_output.num_scheduled_tokens:
|
||||
request = self.requests[req_id]
|
||||
pending_structured_output_tokens |= (
|
||||
request.use_structured_output and request.num_output_placeholders > 0
|
||||
)
|
||||
cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, ()))
|
||||
if (
|
||||
request.num_computed_tokens
|
||||
== request.num_tokens
|
||||
+ request.num_output_placeholders
|
||||
+ cur_num_spec_tokens
|
||||
):
|
||||
# The request will generate a new token plus num_spec_tokens
|
||||
# in this scheduling step.
|
||||
request.num_output_placeholders += 1 + cur_num_spec_tokens
|
||||
# Add placeholders for the new tokens in spec_token_ids.
|
||||
# We will update the actual spec token ids in the worker process.
|
||||
request.spec_token_ids = [-1] * self.num_spec_tokens
|
||||
|
||||
scheduler_output.pending_structured_output_tokens = (
|
||||
pending_structured_output_tokens
|
||||
)
|
||||
|
||||
def _update_request_with_output(
|
||||
self,
|
||||
request: Request,
|
||||
new_token_ids: list[int],
|
||||
) -> tuple[list[int], bool]:
|
||||
if request.discard_latest_async_tokens:
|
||||
# If the request is force preempted in reset_prefix_cache, we
|
||||
# should discard the latest async token.
|
||||
request.discard_latest_async_tokens = False
|
||||
return [], False
|
||||
|
||||
status_before_update = request.status
|
||||
new_token_ids, stopped = super()._update_request_with_output(
|
||||
request, new_token_ids
|
||||
)
|
||||
|
||||
# Update the number of output placeholders.
|
||||
request.num_output_placeholders -= len(new_token_ids)
|
||||
assert request.num_output_placeholders >= 0
|
||||
|
||||
# Cache the new tokens. Preempted requests should be skipped.
|
||||
if status_before_update == RequestStatus.RUNNING:
|
||||
self.kv_cache_manager.cache_blocks(
|
||||
request, request.num_computed_tokens - request.num_output_placeholders
|
||||
)
|
||||
return new_token_ids, stopped
|
||||
189
vllm/v1/core/sched/interface.py
Normal file
189
vllm/v1/core/sched/interface.py
Normal file
@@ -0,0 +1,189 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Iterable
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
|
||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||
from vllm.v1.engine import EngineCoreOutputs
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.metrics.stats import SchedulerStats
|
||||
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
|
||||
class SchedulerInterface(ABC):
|
||||
@abstractmethod
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: "VllmConfig",
|
||||
kv_cache_config: "KVCacheConfig",
|
||||
structured_output_manager: "StructuredOutputManager",
|
||||
block_size: int,
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
include_finished_set: bool = False,
|
||||
log_stats: bool = False,
|
||||
) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def schedule(self) -> "SchedulerOutput":
|
||||
"""Schedule the requests to process in this scheduling step.
|
||||
|
||||
The scheduling decision is made at the iteration level. Each scheduling
|
||||
step corresponds to a single forward pass of the model. Therefore, this
|
||||
method is called repeatedly by a busy loop in the engine.
|
||||
|
||||
Essentially, the scheduler produces a dictionary of {req_id: num_tokens}
|
||||
that specifies how many tokens to process for each request in this
|
||||
scheduling step. For example, num_tokens can be as large as the number
|
||||
of prompt tokens for new requests, or it can be 1 for the requests that
|
||||
are auto-regressively generating new tokens one by one. Otherwise, it
|
||||
can be somewhere in between in case of chunked prefills, prefix caching,
|
||||
speculative decoding, etc.
|
||||
|
||||
Additionally, the scheduler also returns useful data about each request
|
||||
or the batch as a whole. The model runner will use this information in
|
||||
preparing inputs to the model.
|
||||
|
||||
Returns:
|
||||
A SchedulerOutput object containing information about the scheduled
|
||||
requests.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_grammar_bitmask(
|
||||
self, scheduler_output: "SchedulerOutput"
|
||||
) -> "GrammarOutput | None":
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def update_from_output(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
model_runner_output: "ModelRunnerOutput",
|
||||
) -> dict[int, "EngineCoreOutputs"]:
|
||||
"""Update the scheduler state based on the model runner output.
|
||||
|
||||
This method is called after the model runner has processed the scheduled
|
||||
requests. The model runner output includes generated token ids, draft
|
||||
token ids for next step, etc. The scheduler uses this information to
|
||||
update its states, checks the finished requests, and returns the output
|
||||
for each request.
|
||||
|
||||
Returns:
|
||||
A dict of client index to EngineCoreOutputs object containing the
|
||||
outputs for each request originating from that client.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def update_draft_token_ids(
|
||||
self,
|
||||
draft_token_ids: "DraftTokenIds",
|
||||
) -> None:
|
||||
"""Update the draft token ids for the scheduled requests."""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def add_request(self, request: "Request") -> None:
|
||||
"""Add a new request to the scheduler's internal queue.
|
||||
|
||||
Args:
|
||||
request: The new request being added.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def finish_requests(
|
||||
self,
|
||||
request_ids: str | Iterable[str],
|
||||
finished_status: "RequestStatus",
|
||||
) -> None:
|
||||
"""Finish the requests in the scheduler's internal queue. If the request
|
||||
is not in the queue, this method will do nothing.
|
||||
|
||||
This method is called in two cases:
|
||||
1. When the request is aborted by the client.
|
||||
2. When the frontend process detects a stop string of the request after
|
||||
de-tokenizing its generated tokens.
|
||||
|
||||
Args:
|
||||
request_ids: A single or a list of request IDs.
|
||||
finished_status: The finished status of the given requests.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_num_unfinished_requests(self) -> int:
|
||||
"""Number of unfinished requests in the scheduler's internal queue."""
|
||||
raise NotImplementedError
|
||||
|
||||
def has_unfinished_requests(self) -> bool:
|
||||
"""Returns True if there are unfinished requests in the scheduler's
|
||||
internal queue."""
|
||||
return self.get_num_unfinished_requests() > 0
|
||||
|
||||
@abstractmethod
|
||||
def has_finished_requests(self) -> bool:
|
||||
"""Returns True if there are finished requests that need to be cleared.
|
||||
NOTE: This is different from `not self.has_unfinished_requests()`.
|
||||
|
||||
The scheduler maintains an internal list of the requests finished in the
|
||||
previous step. This list is returned from the next call to schedule(),
|
||||
to be sent to the model runner in the next step to clear cached states
|
||||
for these finished requests.
|
||||
|
||||
This method checks if this internal list of finished requests is
|
||||
non-empty. This information is useful for DP attention.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def has_requests(self) -> bool:
|
||||
"""Returns True if there are unfinished requests, or finished requests
|
||||
not yet returned in SchedulerOutputs."""
|
||||
return self.has_unfinished_requests() or self.has_finished_requests()
|
||||
|
||||
@abstractmethod
|
||||
def reset_prefix_cache(
|
||||
self, reset_running_requests: bool = False, reset_connector: bool = False
|
||||
) -> bool:
|
||||
"""Reset the prefix cache for KV cache.
|
||||
|
||||
This is particularly required when the model weights are live-updated.
|
||||
|
||||
Args:
|
||||
reset_running_requests: If True, all the running requests will be
|
||||
preempted and moved to the waiting queue. Otherwise, this method
|
||||
will only reset the KV prefix cache when there is no running request
|
||||
taking KV cache.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_request_counts(self) -> tuple[int, int]:
|
||||
"""Returns (num_running_reqs, num_waiting_reqs)."""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def make_stats(self) -> Optional["SchedulerStats"]:
|
||||
"""Make a SchedulerStats object for logging.
|
||||
|
||||
The SchedulerStats object is created for every scheduling step.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def shutdown(self) -> None:
|
||||
"""Shutdown the scheduler."""
|
||||
raise NotImplementedError
|
||||
|
||||
def get_kv_connector(self) -> Optional["KVConnectorBase_V1"]:
|
||||
return None
|
||||
230
vllm/v1/core/sched/output.py
Normal file
230
vllm/v1/core/sched/output.py
Normal file
@@ -0,0 +1,230 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from functools import cached_property
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from vllm._bc_linter import bc_linter_include
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
import torch
|
||||
|
||||
from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.inputs import MultiModalFeatureSpec
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.request import Request
|
||||
else:
|
||||
ECConnectorMetadata = object
|
||||
KVConnectorMetadata = object
|
||||
LoRARequest = object
|
||||
MultiModalFeatureSpec = object
|
||||
PoolingParams = object
|
||||
SamplingParams = object
|
||||
Request = object
|
||||
|
||||
|
||||
@bc_linter_include
|
||||
@dataclass
|
||||
class NewRequestData:
|
||||
req_id: str
|
||||
prompt_token_ids: list[int] | None
|
||||
mm_features: list[MultiModalFeatureSpec]
|
||||
sampling_params: SamplingParams | None
|
||||
pooling_params: PoolingParams | None
|
||||
block_ids: tuple[list[int], ...]
|
||||
num_computed_tokens: int
|
||||
lora_request: LoRARequest | None
|
||||
prompt_embeds: "torch.Tensor | None" = None
|
||||
|
||||
# Only used for v2 model runner.
|
||||
prefill_token_ids: list[int] | None = None
|
||||
|
||||
@classmethod
|
||||
def from_request(
|
||||
cls,
|
||||
request: Request,
|
||||
block_ids: tuple[list[int], ...],
|
||||
prefill_token_ids: list[int] | None = None,
|
||||
) -> "NewRequestData":
|
||||
return cls(
|
||||
req_id=request.request_id,
|
||||
prompt_token_ids=request.prompt_token_ids,
|
||||
mm_features=request.mm_features,
|
||||
sampling_params=request.sampling_params,
|
||||
pooling_params=request.pooling_params,
|
||||
block_ids=block_ids,
|
||||
num_computed_tokens=request.num_computed_tokens,
|
||||
lora_request=request.lora_request,
|
||||
prompt_embeds=request.prompt_embeds,
|
||||
prefill_token_ids=prefill_token_ids,
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
prompt_embeds_shape = (
|
||||
self.prompt_embeds.shape if self.prompt_embeds is not None else None
|
||||
)
|
||||
return (
|
||||
f"NewRequestData("
|
||||
f"req_id={self.req_id},"
|
||||
f"prompt_token_ids={self.prompt_token_ids},"
|
||||
f"prefill_token_ids={self.prefill_token_ids},"
|
||||
f"mm_features={self.mm_features},"
|
||||
f"sampling_params={self.sampling_params},"
|
||||
f"block_ids={self.block_ids},"
|
||||
f"num_computed_tokens={self.num_computed_tokens},"
|
||||
f"lora_request={self.lora_request},"
|
||||
f"prompt_embeds_shape={prompt_embeds_shape}"
|
||||
")"
|
||||
)
|
||||
|
||||
# Version of __repr__ with the prompt data obfuscated
|
||||
def anon_repr(self) -> str:
|
||||
prompt_token_ids_len = (
|
||||
len(self.prompt_token_ids) if self.prompt_token_ids is not None else None
|
||||
)
|
||||
prompt_embeds_shape = (
|
||||
self.prompt_embeds.shape if self.prompt_embeds is not None else None
|
||||
)
|
||||
return (
|
||||
f"NewRequestData("
|
||||
f"req_id={self.req_id},"
|
||||
f"prompt_token_ids_len={prompt_token_ids_len},"
|
||||
f"mm_features={self.mm_features},"
|
||||
f"sampling_params={self.sampling_params},"
|
||||
f"block_ids={self.block_ids},"
|
||||
f"num_computed_tokens={self.num_computed_tokens},"
|
||||
f"lora_request={self.lora_request},"
|
||||
f"prompt_embeds_shape={prompt_embeds_shape}"
|
||||
")"
|
||||
)
|
||||
|
||||
|
||||
@bc_linter_include
|
||||
@dataclass
|
||||
class CachedRequestData:
|
||||
req_ids: list[str]
|
||||
# For request ids not in resumed_req_ids, new_block_ids will be appended to
|
||||
# the request's block IDs. For those in the set, new_block_ids will be used as the
|
||||
# request's block IDs instead of appending to the existing block IDs.
|
||||
resumed_req_ids: set[str]
|
||||
# NOTE(woosuk): new_token_ids is only used for pipeline parallelism.
|
||||
# When PP is not used, new_token_ids will be empty.
|
||||
new_token_ids: list[list[int]]
|
||||
# For requests not scheduled in the last step, propagate the token ids to the
|
||||
# connector. Won't contain requests that were scheduled in the prior step.
|
||||
all_token_ids: dict[str, list[int]]
|
||||
new_block_ids: list[tuple[list[int], ...] | None]
|
||||
num_computed_tokens: list[int]
|
||||
num_output_tokens: list[int]
|
||||
|
||||
@property
|
||||
def num_reqs(self) -> int:
|
||||
return len(self.req_ids)
|
||||
|
||||
@cached_property
|
||||
@deprecated("This will be removed in v0.14, use `resumed_req_ids` instead.")
|
||||
def resumed_from_preemption(self) -> list[bool]:
|
||||
return [req_id in self.resumed_req_ids for req_id in self.req_ids]
|
||||
|
||||
@cached_property
|
||||
@deprecated("This will be removed in v0.14, use `all_token_ids` instead.")
|
||||
def resumed_req_token_ids(self) -> list[list[int] | None]:
|
||||
return [
|
||||
self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None
|
||||
for req_id in self.req_ids
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def make_empty(cls) -> "CachedRequestData":
|
||||
return cls(
|
||||
req_ids=[],
|
||||
resumed_req_ids=set(),
|
||||
new_token_ids=[],
|
||||
all_token_ids={},
|
||||
new_block_ids=[],
|
||||
num_computed_tokens=[],
|
||||
num_output_tokens=[],
|
||||
)
|
||||
|
||||
|
||||
@bc_linter_include
|
||||
@dataclass
|
||||
class SchedulerOutput:
|
||||
# list of the requests that are scheduled for the first time.
|
||||
# We cache the request's data in each worker process, so that we don't
|
||||
# need to re-send it every scheduling step.
|
||||
scheduled_new_reqs: list[NewRequestData]
|
||||
# list of the requests that have been scheduled before.
|
||||
# Since the request's data is already cached in the worker processes,
|
||||
# we only send the diff to minimize the communication cost.
|
||||
scheduled_cached_reqs: CachedRequestData
|
||||
|
||||
# req_id -> num_scheduled_tokens
|
||||
# Number of tokens scheduled for each request.
|
||||
num_scheduled_tokens: dict[str, int]
|
||||
# Total number of tokens scheduled for all requests.
|
||||
# Equal to sum(num_scheduled_tokens.values())
|
||||
total_num_scheduled_tokens: int
|
||||
# req_id -> spec_token_ids
|
||||
# If a request does not have any spec decode tokens, it will not be
|
||||
# included in the dictionary.
|
||||
scheduled_spec_decode_tokens: dict[str, list[int]]
|
||||
# req_id -> encoder input indices that need processing.
|
||||
# E.g., if a request has [0, 1], it could mean the vision encoder needs
|
||||
# to process that the request's 0-th and 1-th images in the current step.
|
||||
scheduled_encoder_inputs: dict[str, list[int]]
|
||||
# Number of common prefix blocks for all requests in each KV cache group.
|
||||
# This can be used for cascade attention.
|
||||
num_common_prefix_blocks: list[int]
|
||||
|
||||
# Request IDs that are finished in between the previous and the current
|
||||
# steps. This is used to notify the workers about the finished requests
|
||||
# so that they can free the cached states for those requests.
|
||||
finished_req_ids: set[str]
|
||||
# list of mm_hash strings associated with the encoder outputs to be
|
||||
# freed from the encoder cache.
|
||||
free_encoder_mm_hashes: list[str]
|
||||
|
||||
# Request IDs that are preempted in this step.
|
||||
# Only used for v2 model runner.
|
||||
preempted_req_ids: set[str] | None = None
|
||||
|
||||
# Whether the scheduled requests have all the output tokens they
|
||||
# need to perform grammar bitmask computation.
|
||||
pending_structured_output_tokens: bool = False
|
||||
|
||||
# KV Cache Connector metadata.
|
||||
kv_connector_metadata: KVConnectorMetadata | None = None
|
||||
|
||||
# EC Cache Connector metadata
|
||||
ec_connector_metadata: ECConnectorMetadata | None = None
|
||||
|
||||
@classmethod
|
||||
def make_empty(cls) -> "SchedulerOutput":
|
||||
return cls(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=CachedRequestData.make_empty(),
|
||||
num_scheduled_tokens={},
|
||||
total_num_scheduled_tokens=0,
|
||||
scheduled_spec_decode_tokens={},
|
||||
scheduled_encoder_inputs={},
|
||||
num_common_prefix_blocks=[],
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class GrammarOutput:
|
||||
# ids of structured output requests.
|
||||
structured_output_request_ids: list[str]
|
||||
# Bitmask ordered as structured_output_request_ids.
|
||||
grammar_bitmask: "npt.NDArray[np.int32]"
|
||||
217
vllm/v1/core/sched/request_queue.py
Normal file
217
vllm/v1/core/sched/request_queue.py
Normal file
@@ -0,0 +1,217 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import heapq
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import deque
|
||||
from collections.abc import Iterable, Iterator
|
||||
from enum import Enum
|
||||
|
||||
from vllm.v1.request import Request
|
||||
|
||||
|
||||
class SchedulingPolicy(Enum):
|
||||
"""Enum for scheduling policies."""
|
||||
|
||||
FCFS = "fcfs"
|
||||
PRIORITY = "priority"
|
||||
|
||||
|
||||
class RequestQueue(ABC):
|
||||
"""Abstract base class for request queues."""
|
||||
|
||||
@abstractmethod
|
||||
def add_request(self, request: Request) -> None:
|
||||
"""Add a request to the queue according to the policy."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def pop_request(self) -> Request:
|
||||
"""Pop a request from the queue according to the policy."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def peek_request(self) -> Request:
|
||||
"""Peek at the request at the front of the queue without removing it."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def prepend_request(self, request: Request) -> None:
|
||||
"""Prepend a request to the front of the queue."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def prepend_requests(self, requests: "RequestQueue") -> None:
|
||||
"""Prepend all requests from another queue to the front of this
|
||||
queue."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def remove_request(self, request: Request) -> None:
|
||||
"""Remove a specific request from the queue."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def remove_requests(self, requests: Iterable[Request]) -> None:
|
||||
"""Remove multiple specific requests from the queue."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def __bool__(self) -> bool:
|
||||
"""Check if queue has any requests."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def __len__(self) -> int:
|
||||
"""Get number of requests in queue."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def __iter__(self) -> Iterator[Request]:
|
||||
"""Iterate over the queue according to the policy."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def __reversed__(self) -> Iterator[Request]:
|
||||
"""Iterate over the queue in reverse order."""
|
||||
pass
|
||||
|
||||
|
||||
class FCFSRequestQueue(deque[Request], RequestQueue):
|
||||
"""A first-come-first-served queue that supports deque operations."""
|
||||
|
||||
def add_request(self, request: Request) -> None:
|
||||
"""Add a request to the queue according to FCFS policy."""
|
||||
self.append(request)
|
||||
|
||||
def pop_request(self) -> Request:
|
||||
"""Pop a request from the queue according to FCFS policy."""
|
||||
return self.popleft()
|
||||
|
||||
def peek_request(self) -> Request:
|
||||
"""Peek at the next request in the queue without removing it."""
|
||||
if not self:
|
||||
raise IndexError("peek from an empty queue")
|
||||
return self[0]
|
||||
|
||||
def prepend_request(self, request: Request) -> None:
|
||||
"""Prepend a request to the front of the queue."""
|
||||
self.appendleft(request)
|
||||
|
||||
def prepend_requests(self, requests: RequestQueue) -> None:
|
||||
"""Prepend all requests from another queue to the front of this
|
||||
queue."""
|
||||
self.extendleft(reversed(requests))
|
||||
|
||||
def remove_request(self, request: Request) -> None:
|
||||
"""Remove a specific request from the queue."""
|
||||
self.remove(request)
|
||||
|
||||
def remove_requests(self, requests: Iterable[Request]) -> None:
|
||||
"""Remove multiple specific requests from the queue."""
|
||||
requests_to_remove = set(requests)
|
||||
filtered_requests = [req for req in self if req not in requests_to_remove]
|
||||
# deque does not support in-place filtering, so we need to clear
|
||||
# and extend
|
||||
self.clear()
|
||||
self.extend(filtered_requests)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
"""Check if queue has any requests."""
|
||||
return len(self) > 0
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""Get number of requests in queue."""
|
||||
return super().__len__()
|
||||
|
||||
def __iter__(self) -> Iterator[Request]:
|
||||
"""Iterate over the queue according to FCFS policy."""
|
||||
return super().__iter__()
|
||||
|
||||
def __reversed__(self) -> Iterator[Request]:
|
||||
"""Iterate over the queue in reverse order."""
|
||||
return super().__reversed__()
|
||||
|
||||
|
||||
class PriorityRequestQueue(RequestQueue):
|
||||
"""
|
||||
A priority queue that supports heap operations.
|
||||
|
||||
Respects the ordering defined in the Request class, where
|
||||
requests with a smaller value of `priority` are processed first.
|
||||
If multiple requests have the same priority, the one with the earlier
|
||||
`arrival_time` is processed first.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._heap: list[Request] = []
|
||||
|
||||
def add_request(self, request: Request) -> None:
|
||||
"""Add a request to the queue according to priority policy."""
|
||||
heapq.heappush(self._heap, request)
|
||||
|
||||
def pop_request(self) -> Request:
|
||||
"""Pop a request from the queue according to priority policy."""
|
||||
if not self._heap:
|
||||
raise IndexError("pop from empty heap")
|
||||
return heapq.heappop(self._heap)
|
||||
|
||||
def peek_request(self) -> Request:
|
||||
"""Peek at the next request in the queue without removing it."""
|
||||
if not self._heap:
|
||||
raise IndexError("peek from empty heap")
|
||||
return self._heap[0]
|
||||
|
||||
def prepend_request(self, request: Request) -> None:
|
||||
"""Add a request to the queue according to priority policy.
|
||||
|
||||
Note: In a priority queue, there is no concept of prepending to the
|
||||
front. Requests are ordered by (priority, arrival_time)."""
|
||||
self.add_request(request)
|
||||
|
||||
def prepend_requests(self, requests: RequestQueue) -> None:
|
||||
"""Add all requests from another queue according to priority policy.
|
||||
|
||||
Note: In a priority queue, there is no concept of prepending to the
|
||||
front. Requests are ordered by (priority, arrival_time)."""
|
||||
for request in requests:
|
||||
self.add_request(request)
|
||||
|
||||
def remove_request(self, request: Request) -> None:
|
||||
"""Remove a specific request from the queue."""
|
||||
self._heap.remove(request)
|
||||
heapq.heapify(self._heap)
|
||||
|
||||
def remove_requests(self, requests: Iterable[Request]) -> None:
|
||||
"""Remove multiple specific requests from the queue."""
|
||||
requests_to_remove = requests if isinstance(requests, set) else set(requests)
|
||||
self._heap = [r for r in self._heap if r not in requests_to_remove]
|
||||
heapq.heapify(self._heap)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
"""Check if queue has any requests."""
|
||||
return bool(self._heap)
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""Get number of requests in queue."""
|
||||
return len(self._heap)
|
||||
|
||||
def __iter__(self) -> Iterator[Request]:
|
||||
"""Iterate over the queue according to priority policy."""
|
||||
heap_copy = self._heap[:]
|
||||
while heap_copy:
|
||||
yield heapq.heappop(heap_copy)
|
||||
|
||||
def __reversed__(self) -> Iterator[Request]:
|
||||
"""Iterate over the queue in reverse priority order."""
|
||||
return reversed(list(self))
|
||||
|
||||
|
||||
def create_request_queue(policy: SchedulingPolicy) -> RequestQueue:
|
||||
"""Create request queue based on scheduling policy."""
|
||||
if policy == SchedulingPolicy.PRIORITY:
|
||||
return PriorityRequestQueue()
|
||||
elif policy == SchedulingPolicy.FCFS:
|
||||
return FCFSRequestQueue()
|
||||
else:
|
||||
raise ValueError(f"Unknown scheduling policy: {policy}")
|
||||
1826
vllm/v1/core/sched/scheduler.py
Normal file
1826
vllm/v1/core/sched/scheduler.py
Normal file
File diff suppressed because it is too large
Load Diff
64
vllm/v1/core/sched/utils.py
Normal file
64
vllm/v1/core/sched/utils.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import contextlib
|
||||
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
|
||||
|
||||
def remove_all(lst: list, items_to_remove: set) -> list:
|
||||
"""Remove all items from a list that are in the items_to_remove set.
|
||||
|
||||
This method optimizes for the common case of removing a single item,
|
||||
falling back to list comprehension for multiple items.
|
||||
|
||||
Args:
|
||||
lst: The list to remove items from
|
||||
items_to_remove: Set of items to remove
|
||||
|
||||
Returns:
|
||||
Either the modified original list (for single item removal) or
|
||||
a new list (for multiple item removal). Callers should use the
|
||||
returned value.
|
||||
|
||||
Note:
|
||||
For single item removal, this modifies the original list in-place
|
||||
and returns it. For multiple items, it creates and returns a new list.
|
||||
"""
|
||||
if not items_to_remove:
|
||||
return lst
|
||||
|
||||
if len(items_to_remove) == 1:
|
||||
# Fast path for single item removal (most common case)
|
||||
item = next(iter(items_to_remove))
|
||||
with contextlib.suppress(ValueError):
|
||||
lst.remove(item)
|
||||
return lst
|
||||
# For multiple items, use list comprehension
|
||||
return [item for item in lst if item not in items_to_remove]
|
||||
|
||||
|
||||
def check_stop(request: Request, max_model_len: int) -> bool:
|
||||
assert not request.pooling_params
|
||||
|
||||
sampling_params = request.sampling_params
|
||||
assert sampling_params is not None
|
||||
|
||||
if request.num_output_tokens < sampling_params.min_tokens:
|
||||
return False
|
||||
|
||||
last_token_id = request.output_token_ids[-1]
|
||||
if not sampling_params.ignore_eos and last_token_id == request.eos_token_id:
|
||||
request.status = RequestStatus.FINISHED_STOPPED
|
||||
return True
|
||||
|
||||
if last_token_id in (sampling_params.stop_token_ids or ()):
|
||||
request.status = RequestStatus.FINISHED_STOPPED
|
||||
request.stop_reason = last_token_id
|
||||
return True
|
||||
if (
|
||||
request.num_tokens >= max_model_len
|
||||
or request.num_output_tokens >= request.max_tokens
|
||||
):
|
||||
request.status = RequestStatus.FINISHED_LENGTH_CAPPED
|
||||
return True
|
||||
return False
|
||||
Reference in New Issue
Block a user