Add minimal vLLM 0.16.1 build repo for BI-V150

commit d69657327e
2026-04-18 10:56:22 +08:00
1895 changed files with 615301 additions and 0 deletions

vllm/v1/engine/__init__.py (new file, 246 lines)

@@ -0,0 +1,246 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
import time
from collections.abc import Mapping
from typing import Any, Literal
import msgspec
import numpy as np
import torch
from typing_extensions import deprecated
from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.v1.metrics.stats import SchedulerStats
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
from vllm.v1.serial_utils import UtilityResult
# Type for pause_generation mode parameter.
# - "abort": Abort all in-flight requests immediately (default).
# - "wait": Wait for in-flight requests to complete before pausing.
# - "keep": Freeze requests in queue; they resume on resume_generation().
PauseMode = Literal["abort", "wait", "keep"]
# These are possible values of RequestOutput.finish_reason,
# so form part of the external API.
FINISH_REASON_STRINGS = ("stop", "length", "abort", "error")
class FinishReason(enum.IntEnum):
"""
Reason a request finished - stop, length, abort, or error.
Int rather than Str for more compact serialization.
stop - a stop string was emitted
length - max_tokens was consumed, or max_model_len was reached
abort - aborted by client
error - retryable request-level internal error (e.g., KV load failure).
Invariant: always converted to 500 Internal Server Error.
"""
STOP = 0
LENGTH = 1
ABORT = 2
ERROR = 3
def __str__(self):
return FINISH_REASON_STRINGS[self.value]
class EngineCoreRequest(
msgspec.Struct,
array_like=True, # type: ignore[call-arg]
omit_defaults=True, # type: ignore[call-arg]
gc=False,
): # type: ignore[call-arg]
request_id: str
prompt_token_ids: list[int] | None
mm_features: list[MultiModalFeatureSpec] | None
sampling_params: SamplingParams | None
pooling_params: PoolingParams | None
arrival_time: float
lora_request: LoRARequest | None
cache_salt: str | None
data_parallel_rank: int | None
prompt_embeds: torch.Tensor | None = None
# Index of the client, used to ensure outputs are sent back to the same
# client for this request when scaling out the front-end.
client_index: int = 0
# Used in DP case to indicate which wave of requests this is expected to
# belong to, to cover a race condition where the request is sent before
# a wave finished notification is received.
current_wave: int = 0
priority: int = 0
trace_headers: Mapping[str, str] | None = None
resumable: bool = False
# The user-provided request ID. This field is set internally: the
# externally supplied ID (originally assigned to request_id) is
# copied here; see InputProcessor.assign_request_id().
# Used in outputs and to support abort(req_id, internal=False).
external_req_id: str | None = None
reasoning_ended: bool | None = None
@property
def params(self) -> SamplingParams | PoolingParams:
"""Return the processed params (sampling or pooling)."""
if self.sampling_params is not None:
return self.sampling_params
assert self.pooling_params is not None
return self.pooling_params
@property
@deprecated(
"EngineCoreRequest.eos_token_id will be removed in v0.18. "
"Please use EngineCoreRequest.sampling_params.eos_token_id instead."
)
def eos_token_id(self) -> int | None:
if self.sampling_params is None:
return None
return self.sampling_params.eos_token_id
class EngineCoreEventType(enum.IntEnum):
"""The type of engine core request event."""
QUEUED = 1
SCHEDULED = 2
PREEMPTED = 3
class EngineCoreEvent(msgspec.Struct):
"""A timestamped engine core event associated with a request.
The timestamp is a monotonic timestamp and is used by the engine
frontend to calculate intervals between engine core events. These
timestamps should not be compared with timestamps from other processes.
"""
type: EngineCoreEventType
timestamp: float
@classmethod
def new_event(
cls, event_type: EngineCoreEventType, timestamp: float | None = None
) -> "EngineCoreEvent":
timestamp = time.monotonic() if timestamp is None else timestamp
return cls(event_type, timestamp)
class EngineCoreOutput(
msgspec.Struct,
array_like=True, # type: ignore[call-arg]
omit_defaults=True, # type: ignore[call-arg]
gc=False,
): # type: ignore[call-arg]
request_id: str
new_token_ids: list[int]
new_logprobs: LogprobsLists | None = None
new_prompt_logprobs_tensors: LogprobsTensors | None = None
pooling_output: torch.Tensor | None = None
finish_reason: FinishReason | None = None
stop_reason: int | str | None = None
events: list[EngineCoreEvent] | None = None
kv_transfer_params: dict[str, Any] | None = None
trace_headers: Mapping[str, str] | None = None
# The number of tokens with prefix cache hits (local + external).
num_cached_tokens: int = 0
# The number of tokens computed remotely (original count from connector).
num_external_computed_tokens: int = 0
routed_experts: np.ndarray | None = None
# The number of NaNs in logits.
# A value greater than 0 indicates that the output is corrupted.
num_nans_in_logits: int = 0
@property
def finished(self) -> bool:
return self.finish_reason is not None
class UtilityOutput(
msgspec.Struct,
array_like=True, # type: ignore[call-arg]
gc=False,
): # type: ignore[call-arg]
call_id: int
# Non-None implies the call failed; result should be None.
failure_message: str | None = None
result: UtilityResult | None = None
class EngineCoreOutputs(
msgspec.Struct,
array_like=True, # type: ignore[call-arg]
omit_defaults=True, # type: ignore[call-arg]
gc=False,
): # type: ignore[call-arg]
# NOTE(Nick): We could consider ways to make this more compact,
# e.g. columnwise layout
engine_index: int = 0
# [num_reqs]
outputs: list[EngineCoreOutput] = []
scheduler_stats: SchedulerStats | None = None
timestamp: float = 0.0
utility_output: UtilityOutput | None = None
finished_requests: set[str] | None = None
# In DP case, used to signal that the current wave of requests
# has finished and the engines are paused.
wave_complete: int | None = None
# In DP case, used to signal that a request was received for an
# "old" wave, so the next wave needs to be started in other engines.
start_wave: int | None = None
def __post_init__(self):
if self.timestamp == 0.0:
self.timestamp = time.monotonic()
class EngineCoreRequestType(enum.Enum):
"""
Request types defined as hex byte strings, so they can be sent over sockets
without a separate encoding step.
"""
ADD = b"\x00"
ABORT = b"\x01"
START_DP_WAVE = b"\x02"
UTILITY = b"\x03"
# Sentinel used within EngineCoreProc.
EXECUTOR_FAILED = b"\x04"
class ReconfigureDistributedRequest(msgspec.Struct):
new_data_parallel_size: int
new_data_parallel_rank: int
new_data_parallel_rank_local: int
new_data_parallel_master_ip: str
new_data_parallel_master_port: int
class ReconfigureRankType(enum.IntEnum):
"""
Rank type for reconfiguring distributed request.
"""
KEEP_CURRENT_RANK = -1
SHUTDOWN_CURRENT_RANK = -2
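
A minimal usage sketch of the request types defined above; the field values and
construction style are illustrative assumptions, not code from this commit:

import time

from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreRequest, FinishReason

# Hypothetical request; only sampling_params is meaningful here.
req = EngineCoreRequest(
    request_id="req-0",
    prompt_token_ids=[1, 2, 3],
    mm_features=None,
    sampling_params=SamplingParams(max_tokens=16),
    pooling_params=None,
    arrival_time=time.time(),
    lora_request=None,
    cache_salt=None,
    data_parallel_rank=None,
)
assert req.params is req.sampling_params     # sampling params take precedence
assert str(FinishReason.LENGTH) == "length"  # compact IntEnum -> string form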

vllm/v1/engine/async_llm.py (new file, 1059 lines)
File diff suppressed because it is too large.

@@ -0,0 +1,394 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import multiprocessing
import time
import weakref
import msgspec.msgpack
import zmq
from vllm.config import ParallelConfig
from vllm.logger import init_logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.utils.system_utils import get_mp_context, set_process_title
from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequestType
from vllm.v1.serial_utils import MsgpackDecoder
from vllm.v1.utils import get_engine_client_zmq_addr, shutdown
logger = init_logger(__name__)
class DPCoordinator:
"""Coordinator process used for data-parallel deployments (DP>1).
Intermediates between multiple DP engine rank processes and one or more
front-end API server processes.
* Collects stats from each DP engine (currently just waiting and running
queue lengths), and publishes these to all front-ends for use in
load-balancing decisions.
* Keeps track of the current DP "request wave" number and running state
of the engines. This is received from the DP rank 0 engine and published
to the front-end processes along with the current load stats.
The engines alternate between a global running/paused state. The global
"request wave" number is a count of the number of times that the workers
collectively move from a running state to a paused state. This transition
is synchronized via the all-reduce operation performed in the
DPEngineCoreProc._has_global_unfinished_reqs method.
* Broadcasts the START_DP_WAVE message to engines to move them from paused
to running state when one engine receives a new request. This can happen
in two cases:
1) A front-end sending a new request while the engines are paused will
concurrently notify the coordinator.
2) An engine receiving a request for a stale request wave while in paused
state will notify the coordinator.
Engines will move into running state when receiving a new request or
START_DP_WAVE message.
Note that when deployed in External LB mode, no stats will be published by
the engines and thus updates will only be sent to front-ends when the
request wave / running state changes.
"""
def __init__(
self, parallel_config: ParallelConfig, enable_wave_coordination: bool = True
):
dp_size = parallel_config.data_parallel_size
assert dp_size > 1, "Coordinator only used for data parallel"
host = parallel_config.data_parallel_master_ip
# Assume coordinator is colocated with front-end procs when not in
# either external or hybrid DP LB mode.
local_only = not parallel_config.local_engines_only
front_publish_address = get_engine_client_zmq_addr(
local_only=local_only, host=host
)
local_only_eng = dp_size == parallel_config.data_parallel_size_local
back_publish_address = get_engine_client_zmq_addr(local_only_eng, host)
back_output_address = get_engine_client_zmq_addr(local_only_eng, host)
context = get_mp_context()
self.proc: multiprocessing.Process = context.Process(
target=DPCoordinatorProc.run_coordinator,
name="VLLM_DP_Coordinator",
kwargs={
"engine_count": parallel_config.data_parallel_size,
"front_publish_address": front_publish_address,
"back_output_address": back_output_address,
"back_publish_address": back_publish_address,
"enable_wave_coordination": enable_wave_coordination,
},
daemon=True,
)
self.proc.start()
self.stats_publish_address = front_publish_address
self.coord_in_address = back_publish_address
self.coord_out_address = back_output_address
self._finalizer = weakref.finalize(self, shutdown, [self.proc])
def get_stats_publish_address(self) -> str:
return self.stats_publish_address
def get_engine_socket_addresses(self) -> tuple[str, str]:
"""Returns tuple of ZMQ input address, output address."""
return self.coord_in_address, self.coord_out_address
def close(self):
self._finalizer()
class EngineState:
def __init__(self):
self.request_counts = [0, 0] # [waiting, running]
class DPCoordinatorProc:
def __init__(
self,
engine_count: int,
min_stats_update_interval_ms: int = 100,
enable_wave_coordination: bool = True,
):
set_process_title("DPCoordinator")
self.ctx = zmq.Context()
self.engines = [EngineState() for _ in range(engine_count)]
self.stats_update_interval_ms = min_stats_update_interval_ms
self.enable_wave_coordination = enable_wave_coordination
@staticmethod
def run_coordinator(
engine_count: int,
front_publish_address: str,
back_output_address: str,
back_publish_address: str,
min_stats_update_interval_ms: int = 100,
enable_wave_coordination: bool = True,
):
coordinator = DPCoordinatorProc(
engine_count=engine_count,
min_stats_update_interval_ms=min_stats_update_interval_ms,
enable_wave_coordination=enable_wave_coordination,
)
try:
coordinator.process_input_socket(
front_publish_address,
back_output_address,
back_publish_address,
)
except KeyboardInterrupt:
logger.info("DP Coordinator process exiting")
def process_input_socket(
self,
front_publish_address: str,
back_output_address: str,
back_publish_address: str,
):
decoder = MsgpackDecoder(EngineCoreOutputs)
# For tracking request wave progression.
current_wave = 0
engines_running = False
# For tracking request counts for internal load-balancing.
stats_changed = False
last_stats_step = -1
last_stats_wave = -1
last_step_counts: list[list[int]] | None = None
with (
make_zmq_socket(
path=front_publish_address, # IPC
ctx=self.ctx,
socket_type=zmq.XPUB,
bind=True,
) as publish_front,
make_zmq_socket(
path=back_output_address, # IPC or TCP
ctx=self.ctx,
socket_type=zmq.PULL,
bind=True,
) as output_back,
make_zmq_socket(
path=back_publish_address, # IPC or TCP
ctx=self.ctx,
socket_type=zmq.XPUB,
bind=True,
) as publish_back,
):
# Wait until all engines subscribe.
for _ in self.engines:
if publish_back.recv() != b"\x01":
logger.error(
"DP Coordinator received unexpected message while "
"waiting for engines to subscribe"
)
return
# Send ready message to engines.
publish_back.send(b"READY")
logger.info("All engine subscriptions received by DP coordinator")
poller = zmq.Poller()
poller.register(publish_front, zmq.POLLIN)
poller.register(output_back, zmq.POLLIN)
last_publish_time = 0
while True:
elapsed = int(time.time() * 1000) - last_publish_time
# Send at stats_update_interval_ms interval if the stats have
# changed, or otherwise every 5 seconds.
wait_for = self.stats_update_interval_ms if stats_changed else 5000
# Wait at least 50ms to ensure we've received all stats for
# the current step.
min_timeout = 50 if last_step_counts is None else 0
events = poller.poll(timeout=max(min_timeout, wait_for - elapsed))
if not events:
# Poller timeout - publish current stats to front-ends.
if last_step_counts is not None:
engine_req_counts_list = last_step_counts
last_step_counts = None
else:
engine_req_counts_list = self._get_engine_counts()
stats_changed = False
to_publish = (engine_req_counts_list, current_wave, engines_running)
publish_front.send(msgspec.msgpack.encode(to_publish))
last_publish_time = int(time.time() * 1000)
continue
events = dict(events)
wave_state_changed = False
if publish_front in events:
buffer = publish_front.recv()
if buffer in (b"\x01", b"\x00"):
# Ignore subscription messages.
continue
decoded = msgspec.msgpack.decode(buffer)
if (
isinstance(decoded, (list, tuple))
and len(decoded) == 2
and decoded[0] == "SCALE_ELASTIC_EP"
):
# Handle scale up notification
new_engine_count = decoded[1]
current_count = len(self.engines)
if new_engine_count > current_count:
for _ in range(new_engine_count - current_count):
self.engines.append(EngineState())
# NOTE(yongji): Handle the case where newly started engines
# have current_wave = 0. If the existing engines just finished
# a wave and engines_running hasn't been updated yet in the
# CoordinatorProc, requests routed to the newly started engines
# may not wake up the existing engines, as long as
# 0 < request.wave < the existing engines' current_wave
# (0 being the wave number of the new engines).
engines_running = False
logger.info(
"DPCoordinator scaled up from %s to %s engines",
current_count,
new_engine_count,
)
else:
self.engines = self.engines[:new_engine_count]
logger.info(
"DPCoordinator scaled down from %s to %s engines",
current_count,
new_engine_count,
)
continue # Skip normal engine notification processing
# Wave coordination: handle new-request messages from front-end.
# Only process these when wave coordination is enabled
if self.enable_wave_coordination:
# We received a message on the front-end XPUB socket,
# from an API server sending a new request while the
# engines are paused, so that we can wake the other
# engines.
engine_to_exclude, wave = decoded
if not engines_running:
if wave < current_wave:
# If the wave number is stale, ensure the message
# is handled by all the engines.
engine_to_exclude = None
engines_running = True
wave_state_changed = True
self._send_start_wave(
publish_back, current_wave, engine_to_exclude
)
if output_back in events:
# We received a message from one of the engines.
buffer = output_back.recv()
outputs: EngineCoreOutputs = decoder.decode(buffer)
assert not outputs.outputs
assert outputs.utility_output is None
eng_index = outputs.engine_index
scheduler_stats = outputs.scheduler_stats
if scheduler_stats:
# 1. Updated request load stats - update our local
# state with these.
stats = self.engines[eng_index].request_counts
stats_step = scheduler_stats.step_counter
stats_wave = scheduler_stats.current_wave
if (
stats_wave > last_stats_wave
or stats_wave == last_stats_wave
and stats_step > last_stats_step
):
if stats_changed:
last_step_counts = self._get_engine_counts(do_copy=True)
last_stats_step = stats_step
last_stats_wave = stats_wave
elif stats_wave != last_stats_wave or (
stats_step != last_stats_step
):
logger.warning(
"Received stats for out-of-order "
"step (%d, %d) from engine %d (expected "
"> (%d, %d))",
stats_wave,
stats_step,
eng_index,
last_stats_wave,
last_stats_step,
)
stats[0] = scheduler_stats.num_waiting_reqs
stats[1] = scheduler_stats.num_running_reqs
stats_changed = True
# Wave coordination: handle wave completion and start notifications
# Only process these when wave coordination is enabled
if self.enable_wave_coordination:
if (wave := outputs.wave_complete) is not None:
# 2. Notification from rank 0 engine that we've
# moved into the global paused state
# (engines_running==False).
if current_wave <= wave:
new_wave = wave + 1
logger.debug(
"Moving DP wave from %d to %d.",
current_wave,
new_wave,
)
current_wave = new_wave
engines_running = False
wave_state_changed = True
elif (wave := outputs.start_wave) is not None and (
wave > current_wave
or (wave == current_wave and not engines_running)
):
# 3. The engine received request for a non-current wave
# so we must ensure that other engines progress to the
# next wave (race condition handling).
logger.debug(
"Starting wave %d after notification of "
"stale wave request from engine.",
wave,
)
current_wave = wave
engines_running = True
wave_state_changed = True
self._send_start_wave(publish_back, wave, eng_index)
if wave_state_changed:
message = (None, current_wave, engines_running)
publish_front.send(msgspec.msgpack.encode(message))
@staticmethod
def _send_start_wave(
socket: zmq.Socket, wave: int, exclude_engine_index: int | None
):
"""Broadcast the START_DP_WAVE message to all the engines.
The message includes the current wave number and the index of the
engine that has already received a request with this wave number,
and so doesn't require additional notification.
"""
wave_encoded = msgspec.msgpack.encode((wave, exclude_engine_index))
socket.send_multipart((EngineCoreRequestType.START_DP_WAVE.value, wave_encoded))
def _get_engine_counts(self, do_copy=False) -> list[list[int]]:
"""Return list of [waiting, running] count lists for each engine."""
if do_copy:
return [copy.copy(e.request_counts) for e in self.engines]
return [e.request_counts for e in self.engines]
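
For orientation, a hedged sketch of a front-end consumer of the coordinator's
stats socket, based on the (counts, wave, running) msgpack tuple published
above; the SUB-socket setup and the address literal are assumptions:

import msgspec.msgpack
import zmq

# The address would come from DPCoordinator.get_stats_publish_address();
# the literal below is a placeholder.
ctx = zmq.Context()
sub = ctx.socket(zmq.SUB)
sub.connect("ipc:///tmp/vllm_dp_stats")
sub.setsockopt(zmq.SUBSCRIBE, b"")

# Each frame decodes to (engine_req_counts, current_wave, engines_running),
# where engine_req_counts holds a [waiting, running] pair per engine.
counts, wave, running = msgspec.msgpack.decode(sub.recv())
for idx, (waiting, running_reqs) in enumerate(counts):
    print(f"engine {idx}: waiting={waiting} running={running_reqs} wave={wave}")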

vllm/v1/engine/core.py (new file, 1842 lines)
File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -0,0 +1,341 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
import tokenizers
from packaging import version
from tokenizers import Tokenizer
from tokenizers.decoders import DecodeStream
from transformers import PreTrainedTokenizerFast
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.detokenizer_utils import (
convert_prompt_ids_to_tokens,
detokenize_incrementally,
)
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreRequest
logger = init_logger(__name__)
# Only tokenizers >= 0.22.0 support DecodeStream with native prefill
# (ids parameter) used for FastIncrementalDetokenizer.
USE_FAST_DETOKENIZER = version.parse(tokenizers.__version__) >= version.parse("0.22.0")
# Error string from https://github.com/huggingface/tokenizers/blob/909fdde2a4ffedd9295206f705eb612be2a91b12/tokenizers/src/tokenizer/mod.rs#L1042
INVALID_PREFIX_ERR_MSG = "Invalid prefix encountered"
class IncrementalDetokenizer:
def __init__(self):
self.token_ids: list[int] = []
@property
def output_token_ids(self) -> list[int]:
return self.token_ids
def num_output_tokens(self) -> int:
return len(self.token_ids)
def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
self.token_ids.extend(new_token_ids)
return None
def get_next_output_text(self, finished: bool, delta: bool) -> str:
return ""
@classmethod
def from_new_request(
cls,
tokenizer: TokenizerLike | None,
request: EngineCoreRequest,
) -> "IncrementalDetokenizer":
assert request.sampling_params is not None
if tokenizer is None:
# No tokenizer => skipping detokenization.
return IncrementalDetokenizer()
if USE_FAST_DETOKENIZER and isinstance(tokenizer, PreTrainedTokenizerFast):
# Fast tokenizer => use tokenizers library DecodeStream.
return FastIncrementalDetokenizer(tokenizer, request)
# Fall back to slow python-based incremental detokenization.
return SlowIncrementalDetokenizer(tokenizer, request)
class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
def __init__(self, request: EngineCoreRequest):
super().__init__()
# Stop strings
params = request.sampling_params
assert params is not None
stop_list: list[str]
if params.stop is None:
stop_list = []
elif isinstance(params.stop, str):
stop_list = [params.stop]
else:
stop_list = params.stop
self.stop = stop_list
self.min_tokens = params.min_tokens
self.include_stop_str_in_output = params.include_stop_str_in_output
# Number of chars to hold back when stop strings are to be excluded
# from streamed output.
if self.stop and not self.include_stop_str_in_output:
self.stop_buffer_length = max(len(s) for s in self.stop) - 1
else:
self.stop_buffer_length = 0
self._last_output_text_offset: int = 0
# Generation data
self.output_text = ""
def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
"""
Update RequestState for the request_id by:
1) Detokenizing the new token ids incrementally.
2) Evaluating stop criteria.
Return the matched stop string, or None.
"""
if not new_token_ids:
# Skip detokenization if no new token ids.
return None
if stop_terminated and not self.include_stop_str_in_output:
# If stop-terminated, exclude last token from detokenization
# based on include_stop_str_in_output parameter.
skipped_stop_token_id = new_token_ids[-1]
new_token_ids = new_token_ids[:-1]
else:
skipped_stop_token_id = None
# 1) Detokenize the new token ids incrementally.
stop_check_offset = len(self.output_text)
for new_token_id in new_token_ids:
self.token_ids.append(new_token_id)
self.output_text += self.decode_next(new_token_id)
# Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014
if self.min_tokens and self.num_output_tokens() <= self.min_tokens:
stop_check_offset = len(self.output_text)
if skipped_stop_token_id is not None:
# Cleanup after skipping detokenization.
self.token_ids.append(skipped_stop_token_id)
# 2) Evaluate stop strings.
stop_string = None
if self.stop and self.num_output_tokens() > self.min_tokens:
stop = check_stop_strings(
output_text=self.output_text,
new_char_count=len(self.output_text) - stop_check_offset,
stop=self.stop,
include_in_output=self.include_stop_str_in_output,
)
if stop is not None:
stop_string, truncate_to = stop
if truncate_to != -1:
self.output_text = self.output_text[:truncate_to]
return stop_string
@abstractmethod
def decode_next(self, next_token_id: int) -> str:
raise NotImplementedError
def get_next_output_text(self, finished: bool, delta: bool) -> str:
"""If delta is True, only new text since the last call to
this method is returned"""
# We return the full output text if the sequence is finished.
buffer_length = 0 if finished else self.stop_buffer_length
if not delta:
if not buffer_length:
return self.output_text
return self.output_text[:-buffer_length]
length = len(self.output_text) - buffer_length
last_offset = self._last_output_text_offset
if last_offset < length:
self._last_output_text_offset = length
return self.output_text[last_offset:length]
return ""
class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
def __init__(self, tokenizer: PreTrainedTokenizerFast, request: EngineCoreRequest):
super().__init__(request)
sampling_params = request.sampling_params
assert sampling_params is not None
self.request_id = request.request_id
self.skip_special_tokens = sampling_params.skip_special_tokens
self.tokenizer: Tokenizer = tokenizer._tokenizer
# Use native prefill to prime the decode stream with prompt tokens.
self.stream = DecodeStream(
ids=request.prompt_token_ids,
skip_special_tokens=self.skip_special_tokens,
)
self.spaces_between_special_tokens = (
sampling_params.skip_special_tokens
or sampling_params.spaces_between_special_tokens
)
if not self.spaces_between_special_tokens:
# Store dict of added token ids so that we can suppress
# the spaces between them.
added_token_ids = getattr(self.tokenizer, "added_token_ids", None)
if added_token_ids is None:
self.tokenizer.added_token_ids = added_token_ids = {
tid: tok.content
for tid, tok in self.tokenizer.get_added_tokens_decoder().items()
}
if added_token_ids:
self.last_special = False
self.added_token_ids = added_token_ids
else:
# No added tokens.
self.spaces_between_special_tokens = True
def decode_next(self, next_token_id: int) -> str:
token = self._protected_step(next_token_id)
if not self.spaces_between_special_tokens:
special_token = self.added_token_ids.get(next_token_id)
is_special = special_token is not None
if is_special and self.last_special:
# Return raw token string without any prefixed spaces.
token = special_token
self.last_special = is_special
return token or ""
def _protected_step(self, next_token_id: int) -> str | None:
try:
token = self.stream.step(self.tokenizer, next_token_id)
except (OverflowError, TypeError):
# Handle rare observed overflow, still to be diagnosed.
# See https://github.com/vllm-project/vllm/issues/21951.
logger.exception("Encountered invalid token id: %r", next_token_id)
token = None
except Exception as e:
if not str(e).startswith(INVALID_PREFIX_ERR_MSG):
raise e
# Recover from edge case where tokenizer can produce non-monotonic,
# invalid UTF-8 output, which breaks the internal state of
# tokenizers' DecodeStream.
# See https://github.com/vllm-project/vllm/issues/17448.
logger.warning(
"Encountered invalid prefix detokenization error"
" for request %s, resetting decode stream.",
self.request_id,
)
self.stream = DecodeStream(skip_special_tokens=self.skip_special_tokens)
token = self.stream.step(self.tokenizer, next_token_id)
return token
class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
def __init__(self, tokenizer: TokenizerLike, request: EngineCoreRequest):
super().__init__(request)
self.tokenizer = tokenizer
params = request.sampling_params
assert params is not None
self.prompt_len = length_from_prompt_token_ids_or_embeds(
request.prompt_token_ids, request.prompt_embeds
)
# Metadata for incremental detokenization.
if request.prompt_token_ids is not None:
self.tokens, self.prefix_offset, self.read_offset = (
convert_prompt_ids_to_tokens(
tokenizer=tokenizer,
prompt_ids=request.prompt_token_ids,
skip_special_tokens=params.skip_special_tokens,
)
)
else:
# Prompt embedding requests cannot be detokenized, in general.
self.tokens = [""] * self.prompt_len
self.prefix_offset = 0
self.read_offset = 0
self.token_ids.extend(request.prompt_token_ids or [0] * self.prompt_len)
self.skip_special_tokens = params.skip_special_tokens
self.spaces_between_special_tokens = params.spaces_between_special_tokens
@property
def output_token_ids(self) -> list[int]:
if self.prompt_len:
return self.token_ids[self.prompt_len :]
return self.token_ids
def num_output_tokens(self) -> int:
return len(self.token_ids) - self.prompt_len
def decode_next(self, next_token_id: int) -> str:
new_tokens, decoded_text, prefix_offset, read_offset = detokenize_incrementally(
tokenizer=self.tokenizer,
all_input_ids=self.token_ids,
prev_tokens=self.tokens,
prefix_offset=self.prefix_offset,
read_offset=self.read_offset,
skip_special_tokens=self.skip_special_tokens,
spaces_between_special_tokens=self.spaces_between_special_tokens,
)
self.tokens.extend(new_tokens)
self.prefix_offset = prefix_offset
self.read_offset = read_offset
return decoded_text
def check_stop_strings(
output_text: str,
new_char_count: int,
stop: list[str],
include_in_output: bool,
) -> tuple[str, int] | None:
"""Check if any stop strings are matched and truncate sequence
output text accordingly.
Returns tuple (stop_string, offset) if matched, else None, where
stop_string is the matched stop string and offset is the length to
which output_text should be truncated, or -1 for no truncation.
"""
if not new_char_count or not stop:
return None
for stop_str in stop:
stop_string_len = len(stop_str)
# Avoid searching already-searched text.
stop_index = output_text.find(stop_str, 1 - new_char_count - stop_string_len)
if stop_index == -1:
continue
if include_in_output:
# Truncate to end of stop string.
stop_index += stop_string_len
if stop_index >= len(output_text):
# No truncation required.
return stop_str, -1
# Truncate the output text to either the beginning
# or end of the stop string.
return stop_str, stop_index
return None
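
A few worked cases for check_stop_strings defined above; the offsets follow
directly from the logic shown, and the literal strings are invented:

# "###" matched and excluded from output: truncate output_text to index 11.
assert check_stop_strings(
    "hello world###", new_char_count=4, stop=["###"], include_in_output=False
) == ("###", 11)

# Stop string kept and already at the end of the text: no truncation (-1).
assert check_stop_strings(
    "hello world###", new_char_count=4, stop=["###"], include_in_output=True
) == ("###", -1)

# No stop string present yet: returns None.
assert check_stop_strings(
    "hello wor", new_char_count=3, stop=["###"], include_in_output=False
) is None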


@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
class EngineGenerateError(Exception):
"""Raised when a AsyncLLM.generate() fails. Recoverable."""
pass
class EngineDeadError(Exception):
"""Raised when the EngineCore dies. Unrecoverable."""
def __init__(self, *args, suppress_context: bool = False, **kwargs):
ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace (above) for the root cause." # noqa: E501
super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs)
# Make stack trace clearer when using with LLMEngine by
# silencing irrelevant ZMQError.
self.__suppress_context__ = suppress_context
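
An illustrative caller-side sketch of how the two exceptions above are meant
to be treated; the module path and the wrapper function are assumptions:

from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError  # assumed path


def run_one_request(generate):
    """Hypothetical wrapper: swallow per-request failures, propagate fatal ones."""
    try:
        return generate()
    except EngineGenerateError:
        # Recoverable: only this request failed; the engine keeps serving.
        return None
    except EngineDeadError:
        # Unrecoverable: the EngineCore process is gone, so shut down.
        raise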


@@ -0,0 +1,476 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
import warnings
from collections.abc import Mapping
from typing import Any, Literal
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.inputs.data import (
ProcessorInputs,
PromptType,
SingletonInputs,
)
from vllm.inputs.parse import split_enc_dec_inputs
from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.encoder_budget import MultiModalBudget
from vllm.multimodal.inputs import (
MultiModalFeatureSpec,
)
from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams
from vllm.renderers import BaseRenderer, renderer_from_config
from vllm.sampling_params import SamplingParams
from vllm.tasks import GENERATION_TASKS, POOLING_TASKS, SupportedTask
from vllm.tokenizers import TokenizerLike
from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
from vllm.utils.func_utils import supports_kw
from vllm.utils.jsontree import json_iter_leaves
from vllm.v1.engine import EngineCoreRequest
logger = init_logger(__name__)
class InputProcessor:
def __init__(
self,
vllm_config: VllmConfig,
renderer: BaseRenderer | None = None,
*,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
) -> None:
self.vllm_config = vllm_config
self.model_config = model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
self.lora_config = vllm_config.lora_config
self.scheduler_config = vllm_config.scheduler_config
self.speculative_config = vllm_config.speculative_config
self.structured_outputs_config = vllm_config.structured_outputs_config
self.observability_config = vllm_config.observability_config
self.generation_config_fields = model_config.try_get_generation_config()
self.renderer = renderer or renderer_from_config(vllm_config)
self.supports_mm_inputs = mm_registry.supports_multimodal_inputs(model_config)
self.mm_encoder_cache_size = 0
self.skip_prompt_length_check = False
if self.supports_mm_inputs:
mm_budget = MultiModalBudget(vllm_config, mm_registry)
self.mm_encoder_cache_size = mm_budget.encoder_cache_size
self.skip_prompt_length_check = (
mm_budget.processor.info.skip_prompt_length_check
)
mm_budget.reset_cache() # Not used anymore
self.input_preprocessor = InputPreprocessor(
vllm_config,
renderer=renderer,
mm_registry=mm_registry,
)
from vllm.platforms import current_platform
platform_validate_request = current_platform.validate_request
if supports_kw(platform_validate_request, "prompt"):
logger.warning_once(
"The signature of Platform.validate_request has changed from "
"`(cls, prompt, params, processed_inputs) -> None` to "
"`(cls, processed_inputs, params) -> None`. The old signature "
"will no longer be supported starting from v0.18."
)
orig_validate_request = platform_validate_request
def compat_validate_request(
processed_inputs: ProcessorInputs,
params: SamplingParams | PoolingParams,
):
return orig_validate_request(
processed_inputs,
params,
processed_inputs, # type: ignore
) # type: ignore
platform_validate_request = compat_validate_request
self._platform_validate_request = platform_validate_request
@property
def tokenizer(self) -> TokenizerLike | None:
return self.renderer.tokenizer
def get_tokenizer(self) -> TokenizerLike:
return self.renderer.get_tokenizer()
def _validate_params(
self,
params: SamplingParams | PoolingParams,
supported_tasks: tuple[SupportedTask, ...],
) -> None:
"""Raise `ValueError` if SamplingParams or PoolingParams is not valid."""
if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.17. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)
if isinstance(params, SamplingParams):
supported_generation_tasks = [
task for task in supported_tasks if task in GENERATION_TASKS
]
if not supported_generation_tasks:
raise ValueError("This model does not support generation")
params.verify(
self.model_config,
self.speculative_config,
self.structured_outputs_config,
self.tokenizer,
)
elif isinstance(params, PoolingParams):
supported_pooling_tasks = [
task for task in supported_tasks if task in POOLING_TASKS
]
if not supported_pooling_tasks:
raise ValueError("This model does not support pooling")
if params.task is None:
if "token_embed" in supported_pooling_tasks:
params.task = "token_embed"
elif "token_classify" in supported_pooling_tasks:
params.task = "token_classify"
elif "plugin" in supported_pooling_tasks:
params.task = "plugin"
if params.task not in supported_pooling_tasks:
raise ValueError(
f"Unsupported task: {params.task!r} "
f"Supported tasks: {supported_pooling_tasks}"
)
params.verify(self.model_config)
else:
raise TypeError(
f"params must be either SamplingParams or PoolingParams, "
f"but got {type(params).__name__}"
)
def _validate_lora(self, lora_request: LoRARequest | None) -> None:
if lora_request is None:
return
# LoRA request passed in while LoRA is not enabled
if not self.lora_config:
raise ValueError(
f"Got lora_request {lora_request} but LoRA is not enabled!"
)
if self.tokenizer is not None:
logger.warning_once(
"vLLM has deprecated support for supporting different "
"tokenizers for different LoRAs. By default, vLLM uses base "
"model's tokenizer. If you are using a LoRA "
"with its own tokenizer, consider specifying `--tokenizer "
"[lora_path]` to use the LoRA tokenizer."
)
def _get_mm_identifier(
self,
mm_hash: str,
lora_request: LoRARequest | None,
) -> str:
"""
When enable_tower_connector_lora is True, multi-modal embeddings
vary depending on the LoRA request. Therefore, the mm_hash must be
generated based on the LoRA request to prevent incorrect cache hits.
"""
if (
lora_request is None
or self.lora_config is None
or not self.lora_config.enable_tower_connector_lora
):
return mm_hash
return f"{lora_request.lora_name}:{mm_hash}"
@staticmethod
def assign_request_id(request: EngineCoreRequest):
"""Replace the externally supplied request ID with an internal request ID
that adds 8 random characters in order to ensure uniqueness.
"""
if request.external_req_id is not None:
raise ValueError(
"The external_req_id field should not be set on EngineCoreRequests"
" passed to vLLM; use the request_id field."
)
request.external_req_id = request.request_id
if envs.VLLM_DISABLE_REQUEST_ID_RANDOMIZATION:
logger.warning_once(
"VLLM_DISABLE_REQUEST_ID_RANDOMIZATION is set and will be "
"removed in a future release. Duplicate externally-provided "
"request IDs may cause failures and/or subtle correctness errors."
)
else:
request.request_id = f"{request.external_req_id}-{random_uuid():.8}"
def process_inputs(
self,
request_id: str,
prompt: PromptType | ProcessorInputs,
params: SamplingParams | PoolingParams,
supported_tasks: tuple[SupportedTask, ...],
arrival_time: float | None = None,
lora_request: LoRARequest | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
trace_headers: Mapping[str, str] | None = None,
priority: int = 0,
data_parallel_rank: int | None = None,
resumable: bool = False,
) -> EngineCoreRequest:
self._validate_params(params, supported_tasks)
self._validate_lora(lora_request)
parallel_config = self.vllm_config.parallel_config
dp_size = parallel_config.data_parallel_size
dp_local_size = parallel_config.data_parallel_size_local
num_ranks = dp_local_size if parallel_config.local_engines_only else dp_size
if data_parallel_rank is not None and not (0 <= data_parallel_rank < num_ranks):
raise ValueError(
f"data_parallel_rank {data_parallel_rank} "
f"is out of range [0, {num_ranks})."
)
if isinstance(prompt, dict) and "type" in prompt:
if tokenization_kwargs:
logger.warning_once(
"Passing tokenization_kwargs to InputProcessor is deprecated "
"and will be removed in v0.18. You should instead pass "
"them to Renderer.render_cmpl() or Renderer.render_chat()."
)
if arrival_time is None:
arrival_time = prompt.get("arrival_time", time.time()) # type: ignore[assignment]
processed_inputs: ProcessorInputs = prompt # type: ignore[assignment]
else:
logger.warning_once(
"Passing raw prompts to InputProcessor is deprecated "
"and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)
if arrival_time is None:
arrival_time = time.time()
processed_inputs = self.input_preprocessor.preprocess(
prompt,
tokenization_kwargs=tokenization_kwargs,
)
self._platform_validate_request(processed_inputs, params)
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
self._validate_model_inputs(encoder_inputs, decoder_inputs)
# Mypy can be conservative for TypedDict unions; normalize access.
if decoder_inputs["type"] == "embeds":
prompt_token_ids = None
prompt_embeds = decoder_inputs["prompt_embeds"]
else:
prompt_token_ids = decoder_inputs["prompt_token_ids"]
prompt_embeds = None
sampling_params = None
pooling_params = None
if isinstance(params, SamplingParams):
# TODO: can we avoid cloning here in multiproc case?
sampling_params = params.clone()
# If unset max tokens, then generate up to the max_model_len.
if sampling_params.max_tokens is None:
seq_len = length_from_prompt_token_ids_or_embeds(
prompt_token_ids, prompt_embeds
)
sampling_params.max_tokens = self.model_config.max_model_len - seq_len
sampling_params.update_from_generation_config(
self.generation_config_fields,
self.renderer.get_eos_token_id(),
)
if self.tokenizer is not None:
sampling_params.update_from_tokenizer(self.tokenizer)
else:
pooling_params = params.clone()
# Multimodal related.
mm_features: list[MultiModalFeatureSpec] | None = None
if decoder_inputs["type"] == "multimodal":
decoder_mm_inputs = decoder_inputs["mm_kwargs"]
decoder_mm_positions = decoder_inputs["mm_placeholders"]
decoder_mm_hashes = decoder_inputs["mm_hashes"]
if not all(
isinstance(leaf, str) for leaf in json_iter_leaves(decoder_mm_hashes)
):
raise ValueError(
f"mm_hashes must contain only strings, got: {decoder_mm_hashes}. "
"This is likely due to an incorrect custom implementation of "
"MultiModalProcessor.apply method."
)
# Merge and flatten multimodal placeholders, hashes and inputs
# from dictionaries to lists, and sort them by each item's position
# in the input sequence.
sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)
mm_features = []
for modality, idx in sorted_mm_idxs:
base_mm_hash = decoder_mm_hashes[modality][idx]
mm_features.append(
MultiModalFeatureSpec(
data=decoder_mm_inputs[modality][idx],
modality=modality,
identifier=self._get_mm_identifier(
base_mm_hash,
lora_request,
),
mm_position=decoder_mm_positions[modality][idx],
mm_hash=base_mm_hash,
)
)
return EngineCoreRequest(
request_id=request_id,
prompt_token_ids=prompt_token_ids,
prompt_embeds=prompt_embeds,
mm_features=mm_features,
sampling_params=sampling_params,
pooling_params=pooling_params,
arrival_time=arrival_time,
lora_request=lora_request,
cache_salt=decoder_inputs.get("cache_salt"),
priority=priority,
data_parallel_rank=data_parallel_rank,
trace_headers=trace_headers,
resumable=resumable,
)
def _validate_prompt_len(
self,
prompt_len: int,
prompt_type: Literal["encoder", "decoder"],
):
if self.skip_prompt_length_check:
return
if prompt_len == 0 and prompt_type == "decoder":
raise ValueError(f"The {prompt_type} prompt cannot be empty")
model_config = self.model_config
max_prompt_len = (
model_config.max_model_len
if prompt_type == "decoder"
else self.mm_encoder_cache_size
)
if prompt_len > max_prompt_len:
if self.supports_mm_inputs:
suggestion = (
"Make sure that `max_model_len` is no smaller than the "
"number of text tokens plus multimodal tokens. For image "
"inputs, the number of image tokens depends on the number "
"of images, and possibly their aspect ratios as well."
)
else:
suggestion = (
"Make sure that `max_model_len` is no smaller than the "
"number of text tokens."
)
raise ValueError(
f"The {prompt_type} prompt (length {prompt_len}) is "
f"longer than the maximum model length of {max_prompt_len}. "
f"{suggestion}"
)
elif prompt_len == max_prompt_len and model_config.runner_type == "generate":
suggestion = (
"Make sure that `max_model_len` is no smaller than the "
"number of text tokens (prompt + requested output tokens)."
)
raise ValueError(
f"The {prompt_type} prompt (length {prompt_len}) plus the number of "
f"requested output tokens (at least 1) is longer than the maximum "
f"model length of {max_prompt_len}. {suggestion}"
)
def _validate_model_input(
self,
prompt_inputs: SingletonInputs,
prompt_type: Literal["encoder", "decoder"],
) -> None:
model_config = self.model_config
tokenizer = self.tokenizer
prompt_ids = (
None
if prompt_inputs["type"] == "embeds"
else prompt_inputs["prompt_token_ids"]
)
prompt_embeds = (
prompt_inputs["prompt_embeds"]
if prompt_inputs["type"] == "embeds"
else None
)
prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds)
self._validate_prompt_len(prompt_len, prompt_type)
if prompt_inputs["type"] == "multimodal":
decoder_mm_positions = prompt_inputs["mm_placeholders"]
for modality, mm_positions in decoder_mm_positions.items():
for mm_position in mm_positions:
embed_length = mm_position.get_num_embeds()
if embed_length > self.mm_encoder_cache_size:
raise ValueError(
f"The {prompt_type} prompt contains a(n) {modality} item "
f"with length {embed_length}, which exceeds the "
f"pre-allocated encoder cache size "
f"{self.mm_encoder_cache_size}. Please reduce the input "
f"size or increase the encoder cache size "
f"by setting --limit-mm-per-prompt at startup."
)
if prompt_ids and tokenizer is not None:
max_input_id = max(prompt_ids, default=0)
# NOTE: tokenizer.max_token_id is the tokenizer's vocab size while
# self.model_config.get_vocab_size() is the model's vocab size.
# For Qwen3 models, the language model has extra tokens that do
# not exist in the tokenizer, and vice versa for multimodal
# placeholder tokens in some multimodal models.
# See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501
# and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501
# Here we take the max of the two to determine if a token id is
# truly out-of-vocabulary.
model_vocab_size = model_config.get_vocab_size()
if max_input_id > max(tokenizer.max_token_id, model_vocab_size - 1):
raise ValueError(f"Token id {max_input_id} is out of vocabulary")
def _validate_model_inputs(
self,
encoder_inputs: SingletonInputs | None,
decoder_inputs: SingletonInputs,
):
if encoder_inputs is not None:
self._validate_model_input(encoder_inputs, prompt_type="encoder")
self._validate_model_input(decoder_inputs, prompt_type="decoder")
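
A hedged sketch of driving InputProcessor.process_inputs() and
assign_request_id() from above; `input_processor` is assumed to be an
already-constructed InputProcessor, and the prompt, task tuple, and sampling
values are invented for illustration:

from vllm.sampling_params import SamplingParams

# Deprecated raw-prompt path (see the warning above); values are placeholders.
request = input_processor.process_inputs(
    request_id="req-0",
    prompt={"prompt": "Hello, world"},
    params=SamplingParams(max_tokens=32),
    supported_tasks=("generate",),
)

# Randomize the internal ID while preserving the external one.
input_processor.assign_request_id(request)
assert request.external_req_id == "req-0"
assert request.request_id.startswith("req-0-")  # unless randomization is disabled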


@@ -0,0 +1,429 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from collections.abc import Callable, Mapping
from copy import copy
from typing import Any
import torch.nn as nn
from typing_extensions import TypeVar
import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig
from vllm.distributed import stateless_destroy_torch_distributed_process_group
from vllm.distributed.parallel_state import get_dp_group
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import ProcessorInputs, PromptType
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.renderers import renderer_from_config
from vllm.renderers.inputs.preprocess import extract_prompt_components
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tokenizers import TokenizerLike
from vllm.tracing import init_tracer
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest, PauseMode
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.input_processor import InputProcessor
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.executor import Executor
from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
from vllm.v1.metrics.stats import IterationStats
from vllm.v1.utils import record_function_or_nullcontext
from vllm.v1.worker.worker_base import WorkerBase
logger = init_logger(__name__)
_R = TypeVar("_R", default=Any)
class LLMEngine:
"""Legacy LLMEngine for backwards compatibility."""
def __init__(
self,
vllm_config: VllmConfig,
executor_class: type[Executor],
log_stats: bool,
aggregate_engine_logging: bool = False,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: list[StatLoggerFactory] | None = None,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
use_cached_outputs: bool = False,
multiprocess_mode: bool = False,
) -> None:
self.vllm_config = vllm_config
self.model_config = vllm_config.model_config
self.observability_config = vllm_config.observability_config
tracing_endpoint = self.observability_config.otlp_traces_endpoint
if tracing_endpoint is not None:
init_tracer("vllm.llm_engine", tracing_endpoint)
self.log_stats = log_stats
parallel_config = vllm_config.parallel_config
executor_backend = parallel_config.distributed_executor_backend
self.external_launcher_dp = (
parallel_config.data_parallel_size > 1
and executor_backend == "external_launcher"
)
# important: init dp group before init the engine_core
# In the decoupled engine case this is handled in EngineCoreProc.
if (
not multiprocess_mode
and parallel_config.data_parallel_size > 1
and not self.external_launcher_dp
):
self.dp_group = parallel_config.stateless_init_dp_group()
else:
self.dp_group = None
self.should_execute_dummy_batch = False
self.renderer = renderer = renderer_from_config(self.vllm_config)
self.io_processor = get_io_processor(
self.vllm_config,
self.model_config.io_processor_plugin,
)
# Convert TokPrompt --> EngineCoreRequest.
self.input_processor = InputProcessor(self.vllm_config, renderer)
# Converts EngineCoreOutputs --> RequestOutput.
self.output_processor = OutputProcessor(
renderer.tokenizer,
log_stats=self.log_stats,
stream_interval=self.vllm_config.scheduler_config.stream_interval,
tracing_enabled=tracing_endpoint is not None,
)
# EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
self.engine_core = EngineCoreClient.make_client(
multiprocess_mode=multiprocess_mode,
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=self.log_stats,
)
self.logger_manager: StatLoggerManager | None = None
if self.log_stats:
self.logger_manager = StatLoggerManager(
vllm_config=vllm_config,
custom_stat_loggers=stat_loggers,
enable_default_loggers=log_stats,
aggregate_engine_logging=aggregate_engine_logging,
)
self.logger_manager.log_engine_initialized()
if not multiprocess_mode:
# for v0 compatibility
self.model_executor = self.engine_core.engine_core.model_executor # type: ignore
if self.external_launcher_dp:
# If we use DP in external launcher mode, we reuse the
# existing DP group used for data communication.
self.dp_group = get_dp_group().cpu_group
# Don't keep the dummy data in memory
self.reset_mm_cache()
@classmethod
def from_vllm_config(
cls,
vllm_config: VllmConfig,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: list[StatLoggerFactory] | None = None,
disable_log_stats: bool = False,
) -> "LLMEngine":
return cls(
vllm_config=vllm_config,
executor_class=Executor.get_class(vllm_config),
log_stats=(not disable_log_stats),
usage_context=usage_context,
stat_loggers=stat_loggers,
multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING,
)
@classmethod
def from_engine_args(
cls,
engine_args: EngineArgs,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: list[StatLoggerFactory] | None = None,
enable_multiprocessing: bool = False,
) -> "LLMEngine":
"""Creates an LLM engine from the engine arguments."""
# Create the engine configs.
vllm_config = engine_args.create_engine_config(usage_context)
executor_class = Executor.get_class(vllm_config)
if envs.VLLM_ENABLE_V1_MULTIPROCESSING:
logger.debug("Enabling multiprocessing for LLMEngine.")
enable_multiprocessing = True
# Create the LLMEngine.
return cls(
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=not engine_args.disable_log_stats,
usage_context=usage_context,
stat_loggers=stat_loggers,
multiprocess_mode=enable_multiprocessing,
)
def get_num_unfinished_requests(self) -> int:
return self.output_processor.get_num_unfinished_requests()
def has_unfinished_requests(self) -> bool:
has_unfinished = self.output_processor.has_unfinished_requests()
if self.dp_group is None:
return has_unfinished or self.engine_core.dp_engines_running()
return self.has_unfinished_requests_dp(has_unfinished)
def has_unfinished_requests_dp(self, has_unfinished: bool) -> bool:
aggregated_has_unfinished = ParallelConfig.has_unfinished_dp(
self.dp_group, has_unfinished
)
if not has_unfinished and aggregated_has_unfinished:
self.should_execute_dummy_batch = True
return aggregated_has_unfinished
def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
if not hasattr(self, "_supported_tasks"):
# Cache the result
self._supported_tasks = self.engine_core.get_supported_tasks()
return self._supported_tasks
def abort_request(self, request_ids: list[str], internal: bool = False) -> None:
"""Remove request_ids from EngineCore and Detokenizer."""
request_ids = self.output_processor.abort_requests(request_ids, internal)
self.engine_core.abort_requests(request_ids)
def add_request(
self,
request_id: str,
prompt: EngineCoreRequest | PromptType | ProcessorInputs,
params: SamplingParams | PoolingParams,
arrival_time: float | None = None,
lora_request: LoRARequest | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
trace_headers: Mapping[str, str] | None = None,
priority: int = 0,
prompt_text: str | None = None,
) -> str:
# Validate the request_id type.
if not isinstance(request_id, str):
raise TypeError(f"request_id must be a string, got {type(request_id)}")
# Process raw inputs into the request.
if isinstance(prompt, EngineCoreRequest):
logger.warning_once(
"Passing EngineCoreRequest to LLMEngine.generate() and .add_requests() "
"is deprecated and will be removed in v0.18. You should instead pass "
"the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
)
request = prompt
if request_id != request.request_id:
logger.warning_once(
"LLMEngine.add_request() was passed a request_id parameter that "
"does not match the EngineCoreRequest.request_id attribute. The "
"latter will be used, and the former will be ignored."
)
else:
request = self.input_processor.process_inputs(
request_id,
prompt,
params,
supported_tasks=self.get_supported_tasks(),
arrival_time=arrival_time,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
trace_headers=trace_headers,
priority=priority,
)
prompt_text, _, _ = extract_prompt_components(self.model_config, prompt)
self.input_processor.assign_request_id(request)
req_id = request.request_id
# Use cloned params that may have been updated in process_inputs()
params = request.params
n = params.n if isinstance(params, SamplingParams) else 1
if n == 1:
# Make a new RequestState and queue.
self.output_processor.add_request(request, prompt_text, None, 0)
# Add the request to EngineCore.
self.engine_core.add_request(request)
return req_id
# Fan out child requests (for n>1).
parent_req = ParentRequest(request)
for idx in range(n):
request_id, child_params = parent_req.get_child_info(idx)
child_request = request if idx == n - 1 else copy(request)
child_request.request_id = request_id
child_request.sampling_params = child_params
# Make a new RequestState and queue.
self.output_processor.add_request(
child_request, prompt_text, parent_req, idx
)
# Add the request to EngineCore.
self.engine_core.add_request(child_request)
return req_id
def step(self) -> list[RequestOutput | PoolingRequestOutput]:
if self.should_execute_dummy_batch:
self.should_execute_dummy_batch = False
self.engine_core.execute_dummy_batch()
return []
# 1) Get EngineCoreOutput from the EngineCore.
with record_function_or_nullcontext("llm_engine step: get_output"):
outputs = self.engine_core.get_output()
# 2) Process EngineCoreOutputs.
with record_function_or_nullcontext("llm_engine step: process_outputs"):
iteration_stats = IterationStats() if self.log_stats else None
processed_outputs = self.output_processor.process_outputs(
outputs.outputs,
engine_core_timestamp=outputs.timestamp,
iteration_stats=iteration_stats,
)
self.output_processor.update_scheduler_stats(outputs.scheduler_stats)
# 3) Abort any reqs that finished due to stop strings.
with record_function_or_nullcontext("llm_engine step: abort_requests"):
self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
# 4) Record stats
with record_function_or_nullcontext("llm_engine step: record_stats"):
if (
self.logger_manager is not None
and outputs.scheduler_stats is not None
and len(outputs.outputs) > 0
):
self.logger_manager.record(
scheduler_stats=outputs.scheduler_stats,
iteration_stats=iteration_stats,
mm_cache_stats=self.renderer.stat_mm_cache(),
)
self.do_log_stats_with_interval()
return processed_outputs.request_outputs
def start_profile(self, profile_prefix: str | None = None):
self.engine_core.profile(True, profile_prefix)
def stop_profile(self):
self.engine_core.profile(False)
def reset_mm_cache(self):
self.renderer.clear_mm_cache()
self.engine_core.reset_mm_cache()
def reset_prefix_cache(
self, reset_running_requests: bool = False, reset_connector: bool = False
) -> bool:
return self.engine_core.reset_prefix_cache(
reset_running_requests, reset_connector
)
def reset_encoder_cache(self) -> None:
"""Reset the encoder cache to invalidate all cached encoder outputs.
This should be called when model weights are updated to ensure
stale vision embeddings computed with old weights are not reused.
"""
self.engine_core.reset_encoder_cache()
def sleep(self, level: int = 1, mode: PauseMode = "abort"):
self.engine_core.sleep(level, mode)
if self.logger_manager is not None:
self.logger_manager.record_sleep_state(1, level)
def wake_up(self, tags: list[str] | None = None):
self.engine_core.wake_up(tags)
if self.logger_manager is not None:
self.logger_manager.record_sleep_state(0, 0)
def is_sleeping(self) -> bool:
return self.engine_core.is_sleeping()
def get_metrics(self) -> list[Metric]:
assert self.log_stats, "Stat logging disabled"
return get_metrics_snapshot()
@property
def tokenizer(self) -> TokenizerLike | None:
return self.renderer.tokenizer
def get_tokenizer(self) -> TokenizerLike:
return self.renderer.get_tokenizer()
def do_log_stats(self) -> None:
"""Log stats if logging is enabled."""
if self.logger_manager:
self.logger_manager.log()
def do_log_stats_with_interval(self) -> None:
"""Log stats when the time interval has passed."""
now = time.time()
if not hasattr(self, "_last_log_time"):
self._last_log_time = now
if now - self._last_log_time >= envs.VLLM_LOG_STATS_INTERVAL:
self.do_log_stats()
self._last_log_time = now
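# Note: _last_log_time is created lazily on the first call, so the first
# interval is measured from the first step rather than engine construction;
# the interval length comes from the VLLM_LOG_STATS_INTERVAL env setting.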
def add_lora(self, lora_request: LoRARequest) -> bool:
"""Load a new LoRA adapter into the engine for future requests."""
return self.engine_core.add_lora(lora_request)
def remove_lora(self, lora_id: int) -> bool:
"""Remove an already loaded LoRA adapter."""
return self.engine_core.remove_lora(lora_id)
def list_loras(self) -> set[int]:
"""List all registered adapters."""
return self.engine_core.list_loras()
def pin_lora(self, lora_id: int) -> bool:
"""Prevent an adapter from being evicted."""
return self.engine_core.pin_lora(lora_id)
def collective_rpc(
self,
method: str | Callable[[WorkerBase], _R],
timeout: float | None = None,
args: tuple = (),
kwargs: dict[str, Any] | None = None,
) -> list[_R]:
return self.engine_core.collective_rpc(method, timeout, args, kwargs)
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
return self.collective_rpc("apply_model", args=(func,))
def __del__(self):
dp_group = getattr(self, "dp_group", None)
if dp_group is not None and not self.external_launcher_dp:
stateless_destroy_torch_distributed_process_group(dp_group)

245
vllm/v1/engine/logprobs.py Normal file
View File

@@ -0,0 +1,245 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from collections.abc import Iterable
from dataclasses import dataclass
from vllm.logger import init_logger
from vllm.logprobs import (
PromptLogprobs,
SampleLogprobs,
append_logprobs_for_next_position,
create_prompt_logprobs,
create_sample_logprobs,
)
from vllm.tokenizers.detokenizer_utils import (
TokenizerLike,
convert_ids_list_to_tokens,
)
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
logger = init_logger(__name__)
NONES = itertools.repeat(None)
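# Infinite iterator of None values, zipped against token ids below so that
# each position gets a None "decoded token" when detokenization is disabled.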
@dataclass
class LogprobsProcessor:
# Tokenizer for this request,
# None if detokenization is disabled.
tokenizer: TokenizerLike | None
# Logprobs for this request
logprobs: SampleLogprobs | None
prompt_logprobs: PromptLogprobs | None
cumulative_logprob: float | None
num_logprobs: int | None
num_prompt_logprobs: int | None
@classmethod
def from_new_request(
cls,
tokenizer: TokenizerLike | None,
request: EngineCoreRequest,
) -> "LogprobsProcessor":
sampling_params = request.sampling_params
assert sampling_params is not None
num_logprobs = sampling_params.logprobs
num_prompt_logprobs = sampling_params.prompt_logprobs
return cls(
tokenizer=tokenizer,
cumulative_logprob=(None if num_logprobs is None else 0.0),
logprobs=(
None
if num_logprobs is None
else create_sample_logprobs(sampling_params.flat_logprobs)
),
prompt_logprobs=(
None
if num_prompt_logprobs is None
else create_prompt_logprobs(sampling_params.flat_logprobs)
),
num_prompt_logprobs=num_prompt_logprobs,
num_logprobs=num_logprobs,
)
def _update_sample_logprobs(self, logprobs_lists: LogprobsLists) -> None:
"""Update with sample logprobs from EngineCore.
The outer lists have length > 1 only if EngineCore generated
more than one token in the prior step (e.g. with spec decoding).
Args:
logprobs_lists: the lists of logprob tokens, logprobs, and ranks.
"""
assert self.num_logprobs is not None
assert self.logprobs is not None
assert self.cumulative_logprob is not None
token_ids_lst, logprobs_lst, ranks_lst, _ = logprobs_lists
for rank_np, logprobs_np, token_ids_np in zip(
ranks_lst, logprobs_lst, token_ids_lst
):
rank = rank_np.tolist()
logprobs = logprobs_np.tolist()
token_ids = token_ids_np.tolist()
# Detokenize (non-incrementally).
decoded_tokens: list[str] | Iterable[None]
if self.tokenizer is None:
decoded_tokens = NONES
else:
decoded_tokens_list = convert_ids_list_to_tokens(
self.tokenizer, token_ids
)
decoded_tokens = self._verify_tokens(
decoded_tokens_list=decoded_tokens_list, tokens=token_ids
)
# The sampler puts the sampled token's logprob first.
sampled_token_logprob = logprobs[0]
self.cumulative_logprob += sampled_token_logprob
# Update with the Logprob container for this pos.
append_logprobs_for_next_position(
self.logprobs,
token_ids,
logprobs,
decoded_tokens,
rank,
self.num_logprobs,
)
def _update_prompt_logprobs(
self,
prompt_logprobs_tensors: LogprobsTensors,
) -> None:
"""Update with prompt logprobs from EngineCore.
Args:
prompt_logprobs_tensors: tuple containing the prompt logprobs
tensors.
"""
# Prompt logprobs are enabled.
assert self.num_prompt_logprobs is not None
assert self.prompt_logprobs is not None
token_ids, logprobs, ranks, _ = prompt_logprobs_tensors
# Recover shapes.
num_prompt_tokens, num_logprobs = logprobs.shape
# Detokenize non-incrementally.
# Output is flat: [num_tok, num_lps] -> [num_tok * num_lps]
all_decoded_tokens: list[str] | None = (
None
if self.tokenizer is None
else convert_ids_list_to_tokens(
self.tokenizer, token_ids.flatten().tolist()
)
)
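# Example: with num_prompt_tokens=3 and num_logprobs=2 the flat list has
# 6 entries; position pos uses the slice [pos * 2 : pos * 2 + 2].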
# Pythonize the torch tensors.
prompt_token_ranks = ranks.tolist()
prompt_logprobs = logprobs.tolist()
token_ids_list = token_ids.tolist()
# Make Logprob for each position.
for pos in range(num_prompt_tokens):
# Handle flattening and UTF-8 correction per position
offset = pos * num_logprobs
offset_end = offset + num_logprobs
decoded_tokens_for_pos: list[str] | Iterable[None]
if all_decoded_tokens is None:
decoded_tokens_for_pos = NONES
else:
# Extract decoded tokens for this position
decoded_tokens_slice = all_decoded_tokens[offset:offset_end]
# Apply UTF-8 correction within this position's token boundaries
decoded_tokens_for_pos = self._verify_tokens(
decoded_tokens_list=decoded_tokens_slice, tokens=token_ids_list[pos]
)
# Update with the Logprob container for this pos.
append_logprobs_for_next_position(
self.prompt_logprobs,
token_ids_list[pos],
prompt_logprobs[pos],
decoded_tokens_for_pos,
prompt_token_ranks[pos],
self.num_prompt_logprobs,
)
def pop_prompt_logprobs(self) -> PromptLogprobs | None:
"""Pop and return all request prompt logprobs
The logprobs processor aggregates prompt chunk logprobs
over one or more prefill chunks. This method returns
all prompt logprobs at once and then forgets them.
Ensures correct RequestOutputKind.DELTA semantics
wherein all prompt logprobs are returned at once at
the end of prefill.
Returns:
None if prompt logprobs are disabled for this request.
List of all prompt logprobs, otherwise.
"""
plp = self.prompt_logprobs
if plp:
self.prompt_logprobs = []
return plp
def _correct_decoded_token(self, idx: int, tokens: list[int]) -> str:
assert self.tokenizer is not None, "self.tokenizer should not be None"
# try with prev token id in same list
if idx > 0:
possible_decoded_token = self.tokenizer.decode(tokens[idx - 1 : idx + 1])
if not possible_decoded_token.endswith("�"):
return possible_decoded_token
# try with previous logprob token id
if self.logprobs:
latest_token_id = next(iter(self.logprobs[-1]))
decode_ids = [latest_token_id]
if idx > 0:
decode_ids.extend(tokens[idx - 1 : idx + 1])
else:
decode_ids.extend(tokens[idx : idx + 1])
possible_decoded_token = self.tokenizer.decode(decode_ids)
if not possible_decoded_token.endswith("�"):
return possible_decoded_token
# by default return empty string
return ""
def _verify_tokens(
self, decoded_tokens_list: list[str], tokens: list[int]
) -> list[str]:
corrected_decoded_token_map = dict()
for idx, text in enumerate(decoded_tokens_list):
if text.endswith("�"):
# A trailing replacement character ("�") means a potentially unfinished
# byte sequence from byte-fallback tokenization.
corrected_decoded_token_map[idx] = self._correct_decoded_token(
idx, tokens
)
for idx, text in corrected_decoded_token_map.items():
decoded_tokens_list[idx] = text
return decoded_tokens_list
def update_from_output(self, output: EngineCoreOutput) -> None:
if output.new_logprobs is not None:
self._update_sample_logprobs(output.new_logprobs)
if output.new_prompt_logprobs_tensors is not None:
self._update_prompt_logprobs(output.new_prompt_logprobs_tensors)

807
vllm/v1/engine/output_processor.py Normal file
View File

@@ -0,0 +1,807 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from collections import defaultdict, deque
from collections.abc import Iterable
from dataclasses import dataclass
from typing import Any, cast
import numpy as np
import torch
from vllm.lora.request import LoRARequest
from vllm.outputs import (
STREAM_FINISHED,
CompletionOutput,
PoolingOutput,
PoolingRequestOutput,
RequestOutput,
)
from vllm.sampling_params import RequestOutputKind
from vllm.tokenizers import TokenizerLike
from vllm.tracing import (
SpanAttributes,
SpanKind,
extract_trace_context,
instrument_manual,
)
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
from vllm.v1.engine.detokenizer import IncrementalDetokenizer
from vllm.v1.engine.logprobs import LogprobsProcessor
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.metrics.stats import (
IterationStats,
LoRARequestStates,
RequestStateStats,
SchedulerStats,
)
# shared empty CPU tensor used as a placeholder pooling output
EMPTY_CPU_TENSOR = torch.empty(0, device="cpu")
class RequestOutputCollector:
"""
Collects streamed RequestOutputs per individual request,
for hand-off to the consuming asyncio generate task.
When streaming deltas, RequestOutputs are merged if the
producer gets ahead of the consumer.
"""
def __init__(self, output_kind: RequestOutputKind, request_id: str):
self.aggregate = output_kind == RequestOutputKind.DELTA
self.request_id = request_id
self.output: RequestOutput | PoolingRequestOutput | Exception | None = None
self.ready = asyncio.Event()
self._input_stream_task: asyncio.Task | None = None
def put(self, output: RequestOutput | PoolingRequestOutput | Exception) -> None:
"""Non-blocking put operation."""
if self.output is None or isinstance(output, Exception):
self.output = output
self.ready.set()
elif isinstance(self.output, RequestOutput) and isinstance(
output, RequestOutput
):
# This ensures that request outputs with different request indexes
# (if n > 1) do not override each other.
self.output.add(output, aggregate=self.aggregate)
elif isinstance(self.output, PoolingRequestOutput) and isinstance(
output, PoolingRequestOutput
):
self.output = output
async def get(self) -> RequestOutput | PoolingRequestOutput:
"""Get operation blocks on put event."""
while (output := self.output) is None:
await self.ready.wait()
self.output = None
self.ready.clear()
if isinstance(output, Exception):
raise output
return output
def get_nowait(self) -> RequestOutput | PoolingRequestOutput | None:
"""Non-blocking get operation."""
output = self.output
if output is not None:
self.output = None
self.ready.clear()
if isinstance(output, Exception):
raise output
return output
def close(self):
if self._input_stream_task is not None:
self._input_stream_task.cancel()
self._input_stream_task = None
def __del__(self):
if (task := self._input_stream_task) is not None:
task.get_loop().call_soon_threadsafe(task.cancel)
self._input_stream_task = None
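# Rough producer/consumer sketch (assuming AsyncLLM usage): the output
# processor calls put() with each new RequestOutput, merging deltas if the
# consumer falls behind, while the per-request generate() task awaits get()
# and yields whatever has accumulated since its last wakeup.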
@dataclass
class OutputProcessorOutput:
request_outputs: list[RequestOutput | PoolingRequestOutput]
reqs_to_abort: list[str]
@dataclass
class StreamingUpdate:
"""Streaming input update data for output processor.
Contains the incremental prompt data to be applied to a request state
when the current sub-request completes.
"""
prompt: str | None
prompt_token_ids: list[int] | None
arrival_time: float
final: bool = False
class RequestState:
def __init__(
self,
request_id: str,
external_req_id: str,
parent_req: ParentRequest | None,
request_index: int,
lora_request: LoRARequest | None,
output_kind: RequestOutputKind,
prompt: str | None,
prompt_token_ids: list[int] | None,
prompt_embeds: torch.Tensor | None,
logprobs_processor: LogprobsProcessor | None,
detokenizer: IncrementalDetokenizer | None,
max_tokens_param: int | None,
arrival_time: float,
queue: RequestOutputCollector | None,
log_stats: bool,
stream_interval: int,
top_p: float | None = None,
n: int | None = None,
temperature: float | None = None,
stream_input: bool = False,
):
self.request_id = request_id
self.external_req_id = external_req_id
self.parent_req = parent_req
self.request_index = request_index
self.lora_request = lora_request
self.lora_name = lora_request.lora_name if lora_request is not None else None
self.output_kind = output_kind
self.prompt = prompt
self.prompt_token_ids = prompt_token_ids
self.prompt_embeds = prompt_embeds
self.prompt_len = length_from_prompt_token_ids_or_embeds(
self.prompt_token_ids, self.prompt_embeds
)
self.logprobs_processor = logprobs_processor
self.detokenizer = detokenizer
self.max_tokens_param = max_tokens_param
self.top_p = top_p
self.n = n
self.temperature = temperature
self.is_prefilling = True
self.queue = queue
self.num_cached_tokens = 0
self.stats = RequestStateStats(arrival_time=arrival_time) if log_stats else None
# Stream Interval
self.stream_interval = stream_interval
self.sent_tokens_offset = 0 # Offset of sent tokens
# Streaming input queue
self.streaming_input = stream_input
self.input_chunk_queue: deque[StreamingUpdate] | None = (
deque() if stream_input else None
)
def apply_streaming_update(self, update: StreamingUpdate) -> None:
# Apply the update to the request state.
self.streaming_input = not update.final
# TODO also include relevant output tokens in new prompt here
# (match scheduler behavior).
if update.prompt:
self.prompt = (
(self.prompt + update.prompt) if self.prompt else update.prompt
)
if self.prompt_token_ids:
self.prompt_token_ids.extend(update.prompt_token_ids or ())
else:
self.prompt_token_ids = update.prompt_token_ids or []
assert self.prompt_token_ids is not None
self.prompt_len = len(self.prompt_token_ids)
if self.stats is not None:
self.stats.arrival_time = update.arrival_time
self.is_prefilling = True
@classmethod
def from_new_request(
cls,
tokenizer: TokenizerLike | None,
request: EngineCoreRequest,
prompt: str | None,
parent_req: ParentRequest | None,
request_index: int,
queue: RequestOutputCollector | None,
log_stats: bool,
stream_interval: int,
) -> "RequestState":
if sampling_params := request.sampling_params:
if not sampling_params.detokenize:
tokenizer = None
output_kind = sampling_params.output_kind
logprobs_processor = LogprobsProcessor.from_new_request(
tokenizer=tokenizer,
request=request,
)
detokenizer = IncrementalDetokenizer.from_new_request(
tokenizer=tokenizer,
request=request,
)
max_tokens_param = sampling_params.max_tokens
top_p = sampling_params.top_p
n = sampling_params.n
temperature = sampling_params.temperature
else:
logprobs_processor = None
detokenizer = None
max_tokens_param = None
top_p = None
n = None
temperature = None
assert request.pooling_params is not None
output_kind = request.pooling_params.output_kind
assert request.external_req_id is not None
return cls(
request_id=request.request_id,
external_req_id=request.external_req_id,
parent_req=parent_req,
request_index=request_index,
lora_request=request.lora_request,
output_kind=output_kind,
prompt=prompt,
prompt_token_ids=request.prompt_token_ids,
prompt_embeds=request.prompt_embeds,
logprobs_processor=logprobs_processor,
detokenizer=detokenizer,
max_tokens_param=max_tokens_param,
top_p=top_p,
n=n,
temperature=temperature,
arrival_time=request.arrival_time,
queue=queue,
log_stats=log_stats,
stream_interval=stream_interval,
stream_input=request.resumable,
)
def make_request_output(
self,
new_token_ids: list[int],
pooling_output: torch.Tensor | None,
finish_reason: FinishReason | None,
stop_reason: int | str | None,
kv_transfer_params: dict[str, Any] | None = None,
routed_experts: np.ndarray | None = None,
) -> RequestOutput | PoolingRequestOutput | None:
finished = finish_reason is not None
final_only = self.output_kind == RequestOutputKind.FINAL_ONLY
if not finished and final_only:
# Only the final output is required in FINAL_ONLY mode.
return None
if self.stream_interval > 1:
assert self.detokenizer is not None
# Send output request only when
# 1. It has finished, or
# 2. It is the first token, or
# 3. It has reached the stream interval number of tokens
if not (
finished
or self.sent_tokens_offset == 0
or self.detokenizer.num_output_tokens() - self.sent_tokens_offset
>= self.stream_interval
):
return None
if self.output_kind == RequestOutputKind.DELTA:
# Send tokens from the offset in DELTA mode, otherwise all
# tokens are sent.
new_token_ids = self.detokenizer.output_token_ids[
self.sent_tokens_offset :
]
self.sent_tokens_offset = self.detokenizer.num_output_tokens()
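# Example: with stream_interval=4 an output is emitted on the first call,
# whenever at least 4 new tokens have accumulated since the last emission,
# and when the request finishes; in DELTA mode each emission carries only
# the tokens after sent_tokens_offset.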
external_req_id = self.external_req_id
if pooling_output is not None:
return self._new_request_output(
external_req_id,
[self._new_pooling_output(pooling_output)],
finished,
)
output = self._new_completion_output(
new_token_ids, finish_reason, stop_reason, routed_experts
)
if self.parent_req is None:
outputs = [output]
else:
outputs, finished = self.parent_req.get_outputs(self.request_id, output)
if not outputs:
return None
external_req_id = self.parent_req.external_req_id
return self._new_request_output(
external_req_id, outputs, finished, kv_transfer_params
)
def _new_request_output(
self,
external_req_id: str,
outputs: list[CompletionOutput] | list[PoolingOutput],
finished: bool,
kv_transfer_params: dict[str, Any] | None = None,
) -> RequestOutput | PoolingRequestOutput:
# If prompt embeds were used, put placeholder prompt token ids
prompt_token_ids = self.prompt_token_ids
if prompt_token_ids is None and self.prompt_embeds is not None:
prompt_token_ids = [0] * len(self.prompt_embeds)
assert prompt_token_ids is not None
first_output = outputs[0]
if isinstance(first_output, PoolingOutput):
assert len(outputs) == 1
return PoolingRequestOutput(
request_id=external_req_id,
outputs=first_output,
num_cached_tokens=self.num_cached_tokens,
prompt_token_ids=prompt_token_ids,
finished=finished,
)
assert self.logprobs_processor is not None
if self.output_kind == RequestOutputKind.DELTA:
# Side effect: logprobs processor forgets prompt logprobs
prompt_logprobs = self.logprobs_processor.pop_prompt_logprobs()
else:
prompt_logprobs = self.logprobs_processor.prompt_logprobs
return RequestOutput(
request_id=external_req_id, # request_id is what was provided externally
lora_request=self.lora_request,
prompt=self.prompt,
prompt_token_ids=prompt_token_ids,
prompt_logprobs=prompt_logprobs,
outputs=cast(list[CompletionOutput], outputs),
finished=finished,
kv_transfer_params=kv_transfer_params,
num_cached_tokens=self.num_cached_tokens,
metrics=self.stats,
)
def _new_completion_output(
self,
token_ids: list[int],
finish_reason: FinishReason | None,
stop_reason: int | str | None,
routed_experts: np.ndarray | None = None,
) -> CompletionOutput:
assert self.detokenizer is not None
assert self.logprobs_processor is not None
finished = finish_reason is not None
delta = self.output_kind == RequestOutputKind.DELTA
# Prepare text and token_ids, based on delta mode
text = self.detokenizer.get_next_output_text(finished, delta)
if not delta:
token_ids = self.detokenizer.output_token_ids
# Prepare logprobs, based on delta mode
logprobs = self.logprobs_processor.logprobs
if delta and logprobs:
logprobs = logprobs[-len(token_ids) :]
return CompletionOutput(
index=self.request_index,
text=text,
token_ids=token_ids,
routed_experts=routed_experts,
logprobs=logprobs,
cumulative_logprob=self.logprobs_processor.cumulative_logprob,
finish_reason=str(finish_reason) if finished else None,
stop_reason=stop_reason if finished else None,
)
def _new_pooling_output(self, pooling_output: torch.Tensor) -> PoolingOutput:
return PoolingOutput(data=pooling_output)
class OutputProcessor:
"""Process EngineCoreOutputs into RequestOutputs."""
def __init__(
self,
tokenizer: TokenizerLike | None,
*,
log_stats: bool,
stream_interval: int = 1,
tracing_enabled: bool = False,
):
self.log_stats = log_stats
self.tokenizer = tokenizer
self.stream_interval = stream_interval
self.request_states: dict[str, RequestState] = {}
self.parent_requests: dict[str, ParentRequest] = {}
self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list)
self.lora_states = LoRARequestStates(log_stats)
self.tracing_enabled = tracing_enabled
def get_num_unfinished_requests(self):
return len(self.request_states)
def has_unfinished_requests(self) -> bool:
return len(self.request_states) > 0
def propagate_error(self, e: Exception):
"""Propagate error to all generate() tasks."""
for _, state in self.request_states.items():
assert state.queue is not None
state.queue.put(e)
def abort_requests(self, request_ids: Iterable[str], internal: bool) -> list[str]:
"""Abort a list of requests.
The request_ids may be either external request IDs (those passed to
InputProcessor.process_inputs()) or internal request IDs (those randomly
generated when creating the EngineCoreRequest).
If an external request ID is provided, and that external request ID
was used for multiple requests, all requests associated with that external
request ID are aborted.
In the case of parallel sampling, a request ID may be used to identify
a parent request, in which case the associated child requests are aborted
also.
"""
internal_req_ids = []
for request_id in request_ids:
if internal:
# Internal ID - this may be a parent request
internal_req_ids.append(request_id)
# Remove internal ID from the external->internal mapping
if req_state := self.request_states.get(request_id):
external_req_id = req_state.external_req_id
internal_ids = self.external_req_ids[external_req_id]
internal_ids.remove(request_id)
if not internal_ids:
del self.external_req_ids[external_req_id]
elif internal_ids := self.external_req_ids.pop(request_id, []):
# External ID - abort all requests in the external->internal mapping
internal_req_ids.extend(internal_ids)
request_ids_to_abort = []
for request_id in internal_req_ids:
req_state = self.request_states.pop(request_id, None)
if req_state is not None:
self.lora_states.request_finished(request_id, req_state.lora_name)
request_ids_to_abort.append(request_id)
# Produce final abort output.
if req_state.queue is not None and (
request_output := req_state.make_request_output(
new_token_ids=[],
# Pass a non-None placeholder pooling output for pooling
# requests (those without a detokenizer) so the abort takes
# the pooling branch in make_request_output.
pooling_output=EMPTY_CPU_TENSOR
if req_state.detokenizer is None
else None,
finish_reason=FinishReason.ABORT,
stop_reason=None,
kv_transfer_params=None,
)
):
req_state.queue.put(request_output)
elif parent := self.parent_requests.get(request_id):
# Abort children prior to removing the parent.
if parent.child_requests:
child_reqs = list(parent.child_requests)
child_reqs = self.abort_requests(child_reqs, internal=True)
request_ids_to_abort.extend(child_reqs)
self.parent_requests.pop(request_id, None)
return request_ids_to_abort
def add_request(
self,
request: EngineCoreRequest,
prompt: str | None,
parent_req: ParentRequest | None = None,
request_index: int = 0,
queue: RequestOutputCollector | None = None,
) -> None:
request_id = request.request_id
req_state = self.request_states.get(request_id)
if req_state is not None:
self._update_streaming_request_state(req_state, request, prompt)
return
req_state = RequestState.from_new_request(
tokenizer=self.tokenizer,
request=request,
prompt=prompt,
parent_req=parent_req,
request_index=request_index,
queue=queue,
log_stats=self.log_stats,
stream_interval=self.stream_interval,
)
self.request_states[request_id] = req_state
if parent_req:
self.parent_requests[parent_req.request_id] = parent_req
# Track the external_req_id -> [internal_req_id, ...] mapping
self.external_req_ids[req_state.external_req_id].append(request_id)
def _update_streaming_request_state(
self, req_state: RequestState, request: EngineCoreRequest, prompt: str | None
) -> None:
"""Queue a streaming update instead of immediately applying it."""
if not request.resumable:
# Final request - just mark completion, don't add its dummy tokens.
if req_state.input_chunk_queue is None:
# Engine already finished - emit final output and clean up.
self._finish_request(req_state)
if req_state.queue is not None:
# Emit a final output with finished=True
# to unblock the generate() loop.
req_state.queue.put(STREAM_FINISHED)
elif req_state.input_chunk_queue:
req_state.input_chunk_queue[-1].final = True
else:
req_state.streaming_input = False
return
update = StreamingUpdate(
prompt=prompt,
prompt_token_ids=request.prompt_token_ids,
arrival_time=request.arrival_time,
)
# Apply request updates now if the last input already completed.
if req_state.input_chunk_queue is None:
req_state.apply_streaming_update(update)
req_state.input_chunk_queue = deque()
else:
# Queue the streaming update otherwise.
req_state.input_chunk_queue.append(update)
def process_outputs(
self,
engine_core_outputs: list[EngineCoreOutput],
engine_core_timestamp: float | None = None,
iteration_stats: IterationStats | None = None,
) -> OutputProcessorOutput:
"""
Process the EngineCoreOutputs:
1) Compute stats for logging
2) Detokenize
3) Create and handle RequestOutput objects:
* If there is a queue (for usage with AsyncLLM),
put the RequestOutput objects into the queue for
handling by the per-request generate() tasks.
* If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects.
NOTE FOR DEVELOPERS
vLLM V1 minimizes the number of Python loops over the full
batch to keep system overheads low. This is the only function
that should loop over EngineCoreOutputs. If you need to touch
every element of the batch, do it inside the loop below.
"""
request_outputs: list[RequestOutput | PoolingRequestOutput] = []
reqs_to_abort: list[str] = []
for engine_core_output in engine_core_outputs:
req_id = engine_core_output.request_id
req_state = self.request_states.get(req_id)
if req_state is None:
# Ignore output for already-aborted request.
continue
# 1) Compute stats for this iteration.
self._update_stats_from_output(
req_state, engine_core_output, engine_core_timestamp, iteration_stats
)
new_token_ids = engine_core_output.new_token_ids
pooling_output = engine_core_output.pooling_output
finish_reason = engine_core_output.finish_reason
stop_reason = engine_core_output.stop_reason
kv_transfer_params = engine_core_output.kv_transfer_params
routed_experts = engine_core_output.routed_experts
req_state.num_cached_tokens = engine_core_output.num_cached_tokens
req_state.is_prefilling = False
if pooling_output is None:
assert req_state.detokenizer is not None
assert req_state.logprobs_processor is not None
# 2) Detokenize the token ids into text and perform stop checks.
stop_string = req_state.detokenizer.update(
new_token_ids, finish_reason == FinishReason.STOP
)
if stop_string:
finish_reason = FinishReason.STOP
stop_reason = stop_string
# 3) Compute sample and prompt logprobs for request,
# if required.
req_state.logprobs_processor.update_from_output(engine_core_output)
# 4) Create and handle RequestOutput objects.
if request_output := req_state.make_request_output(
new_token_ids,
pooling_output,
finish_reason,
stop_reason,
kv_transfer_params,
routed_experts,
):
if req_state.streaming_input:
request_output.finished = False
if req_state.queue is not None:
# AsyncLLM: put into queue for handling by generate().
req_state.queue.put(request_output)
else:
# LLMEngine: return list of RequestOutputs.
request_outputs.append(request_output)
# Free completed requests.
if finish_reason is not None:
if req_state.streaming_input:
if req_state.input_chunk_queue:
update = req_state.input_chunk_queue.popleft()
req_state.apply_streaming_update(update)
else:
req_state.input_chunk_queue = None
else:
self._finish_request(req_state)
if not engine_core_output.finished:
# If req not finished in EngineCore, but Detokenizer
# detected stop string, abort needed in EngineCore.
reqs_to_abort.append(req_id)
# Track per-request stats
self._update_stats_from_finished(
req_state, finish_reason, iteration_stats
)
if self.tracing_enabled:
self.do_tracing(engine_core_output, req_state, iteration_stats)
return OutputProcessorOutput(
request_outputs=request_outputs,
reqs_to_abort=reqs_to_abort,
)
def _finish_request(self, req_state: RequestState) -> None:
req_id = req_state.request_id
self.request_states.pop(req_id)
internal_ids = self.external_req_ids[req_state.external_req_id]
internal_ids.remove(req_id)
if not internal_ids:
del self.external_req_ids[req_state.external_req_id]
# Remove parent request if applicable.
parent_req = req_state.parent_req
if parent_req and not parent_req.child_requests:
self.parent_requests.pop(parent_req.request_id, None)
def update_scheduler_stats(self, scheduler_stats: SchedulerStats | None):
self.lora_states.update_scheduler_stats(scheduler_stats)
def do_tracing(
self,
engine_core_output: EngineCoreOutput,
req_state: RequestState,
iteration_stats: IterationStats | None,
) -> None:
assert req_state.stats is not None
assert iteration_stats is not None
metrics = req_state.stats
arrival_time_ns = int(metrics.arrival_time * 1e9)
trace_context = extract_trace_context(engine_core_output.trace_headers)
prompt_length = length_from_prompt_token_ids_or_embeds(
req_state.prompt_token_ids, req_state.prompt_embeds
)
# Calculate timing metrics
e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
queued_time = metrics.scheduled_ts - metrics.queued_ts
prefill_time = metrics.first_token_ts - metrics.scheduled_ts
decode_time = metrics.last_token_ts - metrics.first_token_ts
inference_time = metrics.last_token_ts - metrics.scheduled_ts
# Build attributes dict
attributes: dict[str, Any] = {
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN: (
metrics.first_token_latency
),
SpanAttributes.GEN_AI_LATENCY_E2E: e2e_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE: queued_time,
SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS: prompt_length,
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS: (
metrics.num_generation_tokens
),
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL: prefill_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE: decode_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE: inference_time,
SpanAttributes.GEN_AI_REQUEST_ID: req_state.external_req_id,
}
# Add optional request parameters
if req_state.top_p:
attributes[SpanAttributes.GEN_AI_REQUEST_TOP_P] = req_state.top_p
if req_state.max_tokens_param:
attributes[SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS] = (
req_state.max_tokens_param
)
if req_state.temperature:
attributes[SpanAttributes.GEN_AI_REQUEST_TEMPERATURE] = (
req_state.temperature
)
if req_state.n:
attributes[SpanAttributes.GEN_AI_REQUEST_N] = req_state.n
instrument_manual(
span_name="llm_request",
start_time=arrival_time_ns,
attributes=attributes,
context=trace_context,
kind=SpanKind.SERVER,
)
def _update_stats_from_output(
self,
req_state: RequestState,
engine_core_output: EngineCoreOutput,
engine_core_timestamp: float | None,
iteration_stats: IterationStats | None,
):
if iteration_stats is None:
return
assert engine_core_timestamp is not None
assert req_state.stats is not None
iteration_stats.update_from_output(
engine_core_output,
engine_core_timestamp,
req_state.is_prefilling,
req_state.prompt_len,
req_state.stats,
self.lora_states,
req_state.lora_name,
)
def _update_stats_from_finished(
self,
req_state: RequestState,
finish_reason: FinishReason | None,
iteration_stats: IterationStats | None,
):
if iteration_stats is None:
return
assert finish_reason is not None
assert req_state.stats is not None
iteration_stats.update_from_finished_request(
finish_reason=finish_reason,
num_prompt_tokens=req_state.prompt_len,
max_tokens_param=req_state.max_tokens_param,
req_stats=req_state.stats,
num_cached_tokens=req_state.num_cached_tokens,
)
self.lora_states.request_finished(req_state.request_id, req_state.lora_name)
ParentRequest.observe_finished_request(
req_state.parent_req, iteration_stats, req_state.stats.num_generation_tokens
)

150
vllm/v1/engine/parallel_sampling.py Normal file
View File

@@ -0,0 +1,150 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from copy import copy
from typing import cast
from vllm.outputs import CompletionOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.metrics.stats import IterationStats
class ParentRequest:
"""Info, state & processing for parallel sampling request.
Store parent request ID and sampling params.
Facilitate generating child request sampling params.
"""
request_id: str
external_req_id: str
sampling_params: SamplingParams
# To track the completion of child requests
child_requests: set[str]
# To aggregate child completions when not streaming
output_aggregator: list[CompletionOutput]
# To find the max number of generated tokens across all children
max_num_generation_tokens: int
# To efficiently obtain child sampling params
cached_child_sampling_params: SamplingParams | None
def __init__(self, request: EngineCoreRequest) -> None:
assert request.external_req_id is not None
sampling_params = request.params
self.request_id = request.request_id
self.external_req_id = request.external_req_id
self.sampling_params = sampling_params
self.child_requests = set()
self.output_aggregator = (
[cast(CompletionOutput, None)] * sampling_params.n
if (sampling_params.output_kind == RequestOutputKind.FINAL_ONLY)
else []
)
self.max_num_generation_tokens = 0
self.cached_child_sampling_params = None
def _get_child_sampling_params(
self,
index: int,
) -> SamplingParams:
"""Efficiently obtain child `sampling_params`
If `sampling_params.seed` is not `None` then
each child request requires a unique clone of
parent `sampling_params` with a unique seed.
Args:
index: index within `n` child requests
Returns:
Child `sampling_params` instance.
"""
seed = self.sampling_params.seed
if self.cached_child_sampling_params:
# Reuse child sampling_params data structure
return self.cached_child_sampling_params
# Build child sampling_params
child_sampling_params = copy(self.sampling_params)
child_sampling_params.n = 1
if seed is None:
# Cache child sampling_params for later reuse
self.cached_child_sampling_params = child_sampling_params
else:
# Each child gets a clone with a unique seed
child_sampling_params.seed = seed + index
return child_sampling_params
def get_child_info(self, index: int) -> tuple[str, SamplingParams]:
"""Get child request ID and sampling params.
Args:
index: index within `n` child requests.
Returns:
(request ID, sampling_params) tuple
"""
child_req_id = f"{index}_{self.request_id}"
self.child_requests.add(child_req_id)
return child_req_id, self._get_child_sampling_params(index)
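# Example: for parent request "abcd" with n=3 and seed=7, get_child_info
# yields ("0_abcd", seed=7), ("1_abcd", seed=8), ("2_abcd", seed=9);
# with seed=None a single cached SamplingParams(n=1) is shared by all children.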
@property
def n(self) -> int:
return self.sampling_params.n
def get_outputs(
self,
child_request_id: str,
completion_output: CompletionOutput,
) -> tuple[list[CompletionOutput], bool]:
already_finished_and_returned: bool = False
if completion_output.finished():
if child_request_id in self.child_requests:
self.child_requests.remove(child_request_id)
else:
# The child request ID is no longer in child_requests, which
# means the request finished in a previous batch step and its
# output was already returned to the client.
already_finished_and_returned = True
if self.sampling_params.output_kind != RequestOutputKind.FINAL_ONLY:
# If streaming, just return the current output
#
# Do not emit a child request that already finished and was
# returned to the client.
outputs = [] if already_finished_and_returned else [completion_output]
else:
# If not streaming, aggregate the n final outputs.
self.output_aggregator[completion_output.index] = completion_output
outputs = [] if self.child_requests else self.output_aggregator
finished = not self.child_requests
return outputs, finished
def observe_num_generation_tokens(self, num_generation_tokens: int):
self.max_num_generation_tokens = max(
num_generation_tokens, self.max_num_generation_tokens
)
return self.max_num_generation_tokens
@staticmethod
def observe_finished_request(
parent_req: "ParentRequest | None",
iteration_stats: IterationStats,
num_generation_tokens: int,
):
n_param = parent_req.n if parent_req is not None else 1
if parent_req is not None:
num_generation_tokens = parent_req.observe_num_generation_tokens(
num_generation_tokens
)
# Child requests finished, we can now record to iteration stats
if parent_req is None or not parent_req.child_requests:
iteration_stats.max_num_generation_tokens_iter.append(num_generation_tokens)
iteration_stats.n_params_iter.append(n_param)

1090
vllm/v1/engine/utils.py Normal file

File diff suppressed because it is too large