[gpt-oss] Add gpt-oss bf16 support

This commit is contained in:
2025-08-13 21:25:57 +08:00
parent 5d2e7edf78
commit 17ea2ec6aa
1232 changed files with 777 additions and 36 deletions

View File

View File

@@ -0,0 +1,150 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import TYPE_CHECKING, Optional, Union
if TYPE_CHECKING:
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.engine import EngineCoreOutputs
from vllm.v1.metrics.stats import SchedulerStats
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
class SchedulerInterface(ABC):
    """Abstract interface implemented by every vLLM v1 scheduler.

    A scheduler decides, once per engine step, which requests run in the
    next model forward pass and how many tokens each of them processes.
    """

    @abstractmethod
    def schedule(self) -> "SchedulerOutput":
        """Pick the requests (and per-request token counts) for one step.

        Scheduling happens at the iteration level: each call corresponds to
        a single forward pass of the model, so the engine's busy loop calls
        this method repeatedly. Conceptually the scheduler produces a
        {req_id: num_tokens} mapping, where num_tokens may be as large as
        the full prompt length (new request), exactly 1 (auto-regressive
        decoding), or anything in between (chunked prefill, prefix caching,
        speculative decoding, etc.). The result additionally carries
        per-request and batch-level metadata that the model runner uses to
        prepare model inputs.

        Returns:
            A SchedulerOutput object describing the scheduled requests.
        """
        raise NotImplementedError

    @abstractmethod
    def update_from_output(
        self,
        scheduler_output: "SchedulerOutput",
        model_runner_output: "ModelRunnerOutput",
    ) -> dict[int, "EngineCoreOutputs"]:
        """Fold the model runner's results back into the scheduler state.

        Called after the model runner has processed the scheduled requests.
        The runner output carries generated token ids, draft token ids for
        the next step, etc.; the scheduler updates its internal state,
        checks for finished requests, and produces the per-request outputs.

        Returns:
            A dict mapping client index to the EngineCoreOutputs holding the
            outputs for the requests originating from that client.
        """
        raise NotImplementedError

    @abstractmethod
    def add_request(self, request: "Request") -> None:
        """Enqueue a newly arrived request in the scheduler's internal queue.

        Args:
            request: The new request being added.
        """
        raise NotImplementedError

    @abstractmethod
    def finish_requests(
        self,
        request_ids: Union[str, Iterable[str]],
        finished_status: "RequestStatus",
    ) -> None:
        """Mark the given queued requests as finished.

        Requests not present in the queue are silently skipped. This is
        invoked in two situations:
        1. The client aborts a request.
        2. The frontend process detects a stop string for the request after
           de-tokenizing its generated tokens.

        Args:
            request_ids: A single request ID or an iterable of request IDs.
            finished_status: The finished status to assign to the requests.
        """
        raise NotImplementedError

    @abstractmethod
    def get_num_unfinished_requests(self) -> int:
        """Number of unfinished requests in the scheduler's internal queue."""
        raise NotImplementedError

    def has_unfinished_requests(self) -> bool:
        """Whether any request in the internal queue is still unfinished."""
        return self.get_num_unfinished_requests() > 0

    @abstractmethod
    def has_finished_requests(self) -> bool:
        """Whether finished requests are waiting to be cleared.

        NOTE: This is not equivalent to `not self.has_unfinished_requests()`.
        The scheduler keeps an internal list of the requests that finished in
        the previous step; that list is returned from the next schedule()
        call so the model runner can clear the cached states of those
        requests. This method reports whether that list is non-empty, which
        is useful for DP attention.
        """
        raise NotImplementedError

    def has_requests(self) -> bool:
        """Whether there is any work left: unfinished requests, or finished
        requests not yet returned in SchedulerOutputs."""
        if self.has_unfinished_requests():
            return True
        return self.has_finished_requests()

    @abstractmethod
    def reset_prefix_cache(self) -> bool:
        """Reset the prefix cache for the KV cache.

        Required in particular when the model weights are live-updated.
        """
        raise NotImplementedError

    @abstractmethod
    def get_request_counts(self) -> tuple[int, int]:
        """Returns (num_running_reqs, num_waiting_reqs)."""
        raise NotImplementedError

    @abstractmethod
    def make_stats(self) -> Optional["SchedulerStats"]:
        """Build the SchedulerStats object logged for every scheduling step."""
        raise NotImplementedError

    @abstractmethod
    def shutdown(self) -> None:
        """Shutdown the scheduler."""
        raise NotImplementedError

    def get_kv_connector(self) -> Optional["KVConnectorBase_V1"]:
        # Schedulers without a KV-transfer connector report None.
        return None

View File

@@ -0,0 +1,154 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
import numpy as np
import numpy.typing as npt
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorMetadata)
from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.v1.request import Request
@dataclass
class NewRequestData:
    """Full payload for a request being shipped to the workers for the
    first time.

    Workers cache this data per request, so later steps only need to send
    diffs (see CachedRequestData).
    """

    req_id: str
    prompt_token_ids: list[int]
    mm_inputs: list[MultiModalKwargs]
    mm_hashes: list[str]
    mm_positions: list[PlaceholderRange]
    sampling_params: SamplingParams
    block_ids: tuple[list[int], ...]
    num_computed_tokens: int
    lora_request: Optional[LoRARequest]

    @classmethod
    def from_request(
        cls,
        request: Request,
        block_ids: tuple[list[int], ...],
    ) -> NewRequestData:
        """Build a NewRequestData by copying the relevant request fields."""
        return cls(
            req_id=request.request_id,
            prompt_token_ids=request.prompt_token_ids,
            mm_inputs=request.mm_inputs,
            mm_hashes=request.mm_hashes,
            mm_positions=request.mm_positions,
            sampling_params=request.sampling_params,
            block_ids=block_ids,
            num_computed_tokens=request.num_computed_tokens,
            lora_request=request.lora_request,
        )

    def _format(self, prompt_part: str) -> str:
        # Shared body for __repr__ / anon_repr; only the prompt field differs.
        parts = (
            f"req_id={self.req_id}",
            prompt_part,
            f"mm_inputs={self.mm_inputs}",
            f"mm_hashes={self.mm_hashes}",
            f"mm_positions={self.mm_positions}",
            f"sampling_params={self.sampling_params}",
            f"block_ids={self.block_ids}",
            f"num_computed_tokens={self.num_computed_tokens}",
            f"lora_request={self.lora_request}",
        )
        return "NewRequestData(" + ",".join(parts) + ")"

    def __repr__(self):
        return self._format(f"prompt_token_ids={self.prompt_token_ids}")

    # Version of __repr__ with the prompt data obfuscated
    def anon_repr(self):
        return self._format(
            f"prompt_token_ids_len={len(self.prompt_token_ids)}")
@dataclass
class CachedRequestData:
    """Incremental (diff) payload for a request the workers already cache.

    Sent instead of a full NewRequestData to minimize communication cost.
    """

    req_id: str
    # If resumed_from_preemption is False, new_block_ids are appended to the
    # request's existing block IDs. If True, new_block_ids replace the
    # request's block IDs entirely.
    resumed_from_preemption: bool
    new_token_ids: list[int]
    new_block_ids: tuple[list[int], ...]
    num_computed_tokens: int

    @classmethod
    def from_request(
        cls,
        request: Request,
        resumed_from_preemption: bool,
        new_token_ids: list[int],
        new_block_ids: tuple[list[int], ...],
    ) -> CachedRequestData:
        """Assemble the diff payload for `request` for this step."""
        return cls(
            request.request_id,
            resumed_from_preemption,
            new_token_ids,
            new_block_ids,
            request.num_computed_tokens,
        )
@dataclass
class SchedulerOutput:
    """Result of one scheduling step, sent to the model runner/workers."""

    # list of the requests that are scheduled for the first time.
    # We cache the request's data in each worker process, so that we don't
    # need to re-send it every scheduling step.
    scheduled_new_reqs: list[NewRequestData]
    # list of the requests that have been scheduled before.
    # Since the request's data is already cached in the worker processes,
    # we only send the diff to minimize the communication cost.
    scheduled_cached_reqs: list[CachedRequestData]
    # req_id -> num_scheduled_tokens
    # Number of tokens scheduled for each request in this step.
    num_scheduled_tokens: dict[str, int]
    # Total number of tokens scheduled for all requests.
    # Equal to sum(num_scheduled_tokens.values())
    total_num_scheduled_tokens: int
    # req_id -> spec_token_ids
    # If a request does not have any spec decode tokens, it will not be
    # included in the dictionary.
    scheduled_spec_decode_tokens: dict[str, list[int]]
    # req_id -> encoder input indices that need processing.
    # E.g., if a request maps to [0, 1], the vision encoder needs to process
    # that request's 0th and 1st images in the current step.
    scheduled_encoder_inputs: dict[str, list[int]]
    # Number of common prefix blocks for all requests in each KV cache group.
    # This can be used for cascade attention.
    num_common_prefix_blocks: list[int]
    # Request IDs that finished in between the previous and the current
    # steps. Used to notify the workers about the finished requests so that
    # they can free the cached states for those requests.
    finished_req_ids: set[str]
    # list of (req_id, encoder_input_index) tuples.
    # Used to free the encoder cache.
    free_encoder_input_ids: list[tuple[str, int]]
    # Dict of request ids to their index within the batch,
    # for filling the next-token bitmask (structured output).
    structured_output_request_ids: dict[str, int]
    # the grammar bitmask for the whole batch
    grammar_bitmask: Optional[npt.NDArray[np.int32]]
    # KV Cache Connector metadata.
    kv_connector_metadata: Optional[KVConnectorMetadata] = None

File diff suppressed because it is too large. (Load Diff)

View File

@@ -0,0 +1,23 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.v1.request import Request, RequestStatus
def check_stop(request: Request, max_model_len: int) -> bool:
    """Check whether `request` must stop generating, updating it if so.

    When a stop condition is met, `request.status` is set (and
    `request.stop_reason` is recorded for user-provided stop tokens).

    Args:
        request: The request whose latest output token was just appended.
        max_model_len: Upper bound on the request's total token count.

    Returns:
        True if the request is finished, False otherwise.
    """
    # Length-based stop: context window exhausted or max_tokens produced.
    if (request.num_tokens >= max_model_len
            or request.num_output_tokens >= request.max_tokens):
        request.status = RequestStatus.FINISHED_LENGTH_CAPPED
        return True

    params = request.sampling_params
    last_token = request.output_token_ids[-1]

    # EOS stop, unless the caller asked to ignore EOS.
    if not params.ignore_eos and last_token == request.eos_token_id:
        request.status = RequestStatus.FINISHED_STOPPED
        return True

    # User-provided stop tokens; record which token triggered the stop.
    if last_token in (params.stop_token_ids or ()):
        request.status = RequestStatus.FINISHED_STOPPED
        request.stop_reason = last_token
        return True

    return False