Iluvatar-mrv100 SDK 4.3.0

2025-09-15 14:58:11 +08:00
parent 9efe891f99
commit 8af8290b1d
1052 changed files with 294967 additions and 1 deletions
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import enum
+import time
+from typing import Any, Optional, Union
+
+import msgspec
+
+from vllm.lora.request import LoRARequest
+from vllm.multimodal import MultiModalKwargs
+from vllm.multimodal.inputs import PlaceholderRange
+from vllm.sampling_params import SamplingParams
+from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.outputs import LogprobsLists, LogprobsTensors
+
+# These are the possible values of RequestOutput.finish_reason,
+# so they form part of the external API.
+FINISH_REASON_STRINGS = ("stop", "length", "abort")
+
+
+class FinishReason(enum.IntEnum):
+    """
+    Reason a request finished - stop, length, or abort.
+
+    Int rather than Str for more compact serialization.
+
+    stop - a stop string was emitted
+    length - max_tokens was consumed, or max_model_len was reached
+    abort - aborted for another reason
+
+    """
+    STOP = 0
+    LENGTH = 1
+    ABORT = 2
+
+    def __str__(self):
+        return FINISH_REASON_STRINGS[self.value]
+
+
+class EngineCoreRequest(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        omit_defaults=True,  # type: ignore[call-arg]
+        gc=False):  # type: ignore[call-arg]
+
+    # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
+    # but this object is currently not playing well with msgspec
+    # due to circular imports and typing we have in data.py
+
+    request_id: str
+    # NOTE(ywang96): original text prompt is needed when a request is added to
+    # Detokenizer, but set to None when it is added to EngineCoreClient.
+    prompt: Optional[str]
+    prompt_token_ids: list[int]
+    mm_inputs: Optional[list[MultiModalKwargs]]
+    mm_hashes: Optional[list[str]]
+    mm_placeholders: Optional[list[PlaceholderRange]]
+    sampling_params: SamplingParams
+    eos_token_id: Optional[int]
+    arrival_time: float
+    lora_request: Optional[LoRARequest]
+
+
+class EngineCoreEventType(enum.IntEnum):
+    """The type of engine core request event."""
+    QUEUED = 1
+    SCHEDULED = 2
+    PREEMPTED = 3
+
+
+class EngineCoreEvent(msgspec.Struct):
+    """A timestamped engine core event associated with a request.
+
+    The timestamp is a monotonic timestamp and is used by the engine
+    frontend to calculate intervals between engine core events. These
+    timestamps should not be compared with timestamps from other processes.
+    """
+    type: EngineCoreEventType
+    timestamp: float
+
+    @classmethod
+    def new_event(cls,
+                  event_type: EngineCoreEventType,
+                  timestamp: Optional[float] = None) -> "EngineCoreEvent":
+        timestamp = time.monotonic() if timestamp is None else timestamp
+        return cls(event_type, timestamp)
+
+
+class EngineCoreOutput(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        omit_defaults=True,  # type: ignore[call-arg]
+        gc=False):  # type: ignore[call-arg]
+
+    request_id: str
+    new_token_ids: list[int]
+
+    new_logprobs: Optional[LogprobsLists] = None
+    new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None
+
+    finish_reason: Optional[FinishReason] = None
+    stop_reason: Union[int, str, None] = None
+    events: Optional[list[EngineCoreEvent]] = None
+
+    @property
+    def finished(self) -> bool:
+        return self.finish_reason is not None
+
+
+class UtilityOutput(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        gc=False):  # type: ignore[call-arg]
+
+    call_id: int
+
+    # Non-None implies the call failed; result should then be None.
+    failure_message: Optional[str] = None
+    result: Any = None
+
+
+class EngineCoreOutputs(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        omit_defaults=True,  # type: ignore[call-arg]
+        gc=False):  # type: ignore[call-arg]
+
+    # NOTE(Nick): We could consider ways to make this more compact,
+    # e.g. columnwise layout
+
+    engine_index: int = 0
+
+    # [num_reqs]
+    outputs: list[EngineCoreOutput] = []
+    scheduler_stats: Optional[SchedulerStats] = None
+    timestamp: float = 0.0
+
+    utility_output: Optional[UtilityOutput] = None
+    finished_requests: Optional[set[str]] = None
+
+    # In the DP case, used to signal that the engine is paused.
+    engine_paused: bool = False
+
+    def __post_init__(self):
+        if self.timestamp == 0.0:
+            self.timestamp = time.monotonic()
+
+
+class EngineCoreRequestType(enum.Enum):
+    """
+    Request types defined as hex byte strings, so they can be sent over
+    sockets without a separate encoding step.
+    """
+    ADD = b'\x00'
+    ABORT = b'\x01'
+    START_DP = b'\x02'
+    UTILITY = b'\x03'