first commit

2026-03-10 13:31:25 +08:00
parent ba974cecfa
commit b62b889355
2604 changed files with 438977 additions and 0 deletions
--- a/vllm/v1/engine/init.py
+++ b/vllm/v1/engine/init.py
@@ -0,0 +1,203 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import enum
+import time
+from collections.abc import Mapping
+from typing import Any, Optional, Union
+
+import msgspec
+import torch
+
+from vllm.lora.request import LoRARequest
+from vllm.multimodal.inputs import MultiModalFeatureSpec
+from vllm.pooling_params import PoolingParams
+from vllm.sampling_params import SamplingParams
+from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.outputs import LogprobsLists, LogprobsTensors
+
+# These are possible values of RequestOutput.finish_reason,
+# so form part of the external API.
+# NOTE: FinishReason.__str__ indexes this tuple by the enum's integer value,
+# so the order here must stay aligned with the values declared below.
+FINISH_REASON_STRINGS = ("stop", "length", "abort")
+
+
+class FinishReason(enum.IntEnum):
+    """
+    Reason a request finished - stop, length, or abort.
+
+    Int rather than Str for more compact serialization.
+
+    stop - a stop string was emitted
+    length - max_tokens was consumed, or max_model_len was reached
+    abort - aborted for another reason
+
+    str(member) yields the matching entry of FINISH_REASON_STRINGS.
+
+    """
+    STOP = 0
+    LENGTH = 1
+    ABORT = 2
+
+    def __str__(self) -> str:
+        # The integer value doubles as the index into the string table.
+        return FINISH_REASON_STRINGS[self.value]
+
+
+class EngineCoreRequest(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        omit_defaults=True,  # type: ignore[call-arg]
+        gc=False):  # type: ignore[call-arg]
+    """Request message handed to the engine core.
+
+    Declared with array_like=True, so the declaration order of the fields
+    below is part of the serialized layout; do not reorder them casually.
+    """
+
+    # Required fields (no defaults); Optional ones must still be passed
+    # explicitly, possibly as None.
+    request_id: str
+    prompt_token_ids: Optional[list[int]]
+    mm_features: Optional[list[MultiModalFeatureSpec]]
+    sampling_params: Optional[SamplingParams]
+    pooling_params: Optional[PoolingParams]
+    eos_token_id: Optional[int]
+    arrival_time: float
+    lora_request: Optional[LoRARequest]
+    cache_salt: Optional[str]
+    data_parallel_rank: Optional[int]
+    prompt_embeds: Optional[torch.Tensor] = None
+
+    # Index of the client, used to ensure outputs are sent back to the same
+    # client for this request when scaling out the front-end.
+    client_index: int = 0
+
+    # Used in DP case to indicate which wave of requests this is expected to
+    # belong to, to cover a race condition where the request is sent before
+    # a wave finished notification is received.
+    current_wave: int = 0
+    # NOTE(review): ordering semantics of priority are not visible in this
+    # file - confirm against the scheduler before documenting further.
+    priority: int = 0
+
+    trace_headers: Optional[Mapping[str, str]] = None
+
+
+class EngineCoreEventType(enum.IntEnum):
+    """The type of engine core request event.
+
+    Integer-valued; used as the `type` field of EngineCoreEvent.
+    """
+    QUEUED = 1
+    SCHEDULED = 2
+    PREEMPTED = 3
+
+
+class EngineCoreEvent(msgspec.Struct):
+    """A timestamped engine core event associated with a request.
+
+    The timestamp is a monotonic timestamps and is used for by the engine
+    frontend to calculate intervals between engine core events. These
+    timestamps should not be compared with timestamps from other processes.
+    """
+    type: EngineCoreEventType
+    timestamp: float
+
+    @classmethod
+    def new_event(cls,
+                  event_type: EngineCoreEventType,
+                  timestamp: Optional[float] = None) -> "EngineCoreEvent":
+        timestamp = time.monotonic() if timestamp is None else timestamp
+        return cls(event_type, timestamp)
+
+
+class EngineCoreOutput(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        omit_defaults=True,  # type: ignore[call-arg]
+        gc=False):  # type: ignore[call-arg]
+    """Per-request output entry carried inside EngineCoreOutputs.outputs.
+
+    Declared with array_like=True, so field declaration order is part of
+    the serialized layout.
+    """
+
+    request_id: str
+    new_token_ids: list[int]
+
+    new_logprobs: Optional[LogprobsLists] = None
+    new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None
+
+    pooling_output: Optional[torch.Tensor] = None
+
+    # A non-None finish_reason is what marks the request as finished
+    # (see the `finished` property below).
+    finish_reason: Optional[FinishReason] = None
+    stop_reason: Union[int, str, None] = None
+    events: Optional[list[EngineCoreEvent]] = None
+    kv_transfer_params: Optional[dict[str, Any]] = None
+
+    trace_headers: Optional[Mapping[str, str]] = None
+    # The number of tokens with prefix cache hits.
+    num_cached_tokens: int = 0
+
+    @property
+    def finished(self) -> bool:
+        """True once a finish_reason has been set."""
+        return self.finish_reason is not None
+
+
+class UtilityResult:
+    """Wrapper for special handling when serializing/deserializing.
+
+    Holds an arbitrary value in `result`; the wrapper type itself is what
+    (de)serialization code can key on.
+    """
+
+    def __init__(self, r: Any = None):
+        # Stored as-is; no validation or copying is done here.
+        self.result = r
+
+
+class UtilityOutput(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        gc=False):  # type: ignore[call-arg]
+    """Outcome of a utility call, matched to the caller through call_id.
+
+    Carried in EngineCoreOutputs.utility_output.
+    """
+
+    call_id: int
+
+    # Non-None implies the call failed, result should be None.
+    failure_message: Optional[str] = None
+    result: Optional[UtilityResult] = None
+
+
+class EngineCoreOutputs(
+        msgspec.Struct,
+        array_like=True,  # type: ignore[call-arg]
+        omit_defaults=True,  # type: ignore[call-arg]
+        gc=False):  # type: ignore[call-arg]
+
+    # NOTE(Nick): We could consider ways to make this more compact,
+    # e.g. columnwise layout
+
+    engine_index: int = 0
+
+    # [num_reqs]
+    outputs: list[EngineCoreOutput] = []
+    scheduler_stats: Optional[SchedulerStats] = None
+    timestamp: float = 0.0
+
+    utility_output: Optional[UtilityOutput] = None
+    finished_requests: Optional[set[str]] = None
+
+    # In DP case, used to signal that the current wave of requests
+    # has finished and the engines are paused.
+    wave_complete: Optional[int] = None
+    # In DP case, used to signal that a request was received for an
+    # "old" wave, so the next wave needs to be started in other engines.
+    start_wave: Optional[int] = None
+
+    def __post_init__(self):
+        if self.timestamp == 0.0:
+            self.timestamp = time.monotonic()
+
+
+class EngineCoreRequestType(enum.Enum):
+    """
+    Request types defined as hex byte strings, so it can be sent over sockets
+    without separate encoding step.
+
+    Every member value is a single byte.
+    """
+    ADD = b'\x00'
+    ABORT = b'\x01'
+    START_DP_WAVE = b'\x02'
+    UTILITY = b'\x03'
+    # Sentinel used within EngineCoreProc.
+    EXECUTOR_FAILED = b'\x04'
+
+
+class ReconfigureDistributedRequest(msgspec.Struct):
+    """New data-parallel settings (size, ranks, master address/port).
+
+    NOTE(review): the rank fields presumably also accept the sentinel
+    values of ReconfigureRankType - confirm against the consumer.
+    """
+    new_data_parallel_size: int
+    new_data_parallel_rank: int
+    new_data_parallel_rank_local: int
+    new_data_parallel_master_ip: str
+    new_data_parallel_master_port: int
+
+
+class ReconfigureRankType(enum.IntEnum):
+    """
+    Rank type for reconfiguring distributed request.
+
+    Both members are negative integers, so they cannot collide with a
+    non-negative rank index.
+    """
+    KEEP_CURRENT_RANK = -1
+    SHUTDOWN_CURRENT_RANK = -2
--- a/vllm/v1/engine/pycache/init.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/init.cpython-310.pyc
--- a/vllm/v1/engine/pycache/async_llm.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/async_llm.cpython-310.pyc
--- a/vllm/v1/engine/pycache/coordinator.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/coordinator.cpython-310.pyc
--- a/vllm/v1/engine/pycache/core.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/core.cpython-310.pyc
--- a/vllm/v1/engine/pycache/core_client.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/core_client.cpython-310.pyc
--- a/vllm/v1/engine/pycache/detokenizer.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/detokenizer.cpython-310.pyc
--- a/vllm/v1/engine/pycache/exceptions.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/exceptions.cpython-310.pyc
--- a/vllm/v1/engine/pycache/llm_engine.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/llm_engine.cpython-310.pyc
--- a/vllm/v1/engine/pycache/logprobs.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/logprobs.cpython-310.pyc
--- a/vllm/v1/engine/pycache/output_processor.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/output_processor.cpython-310.pyc
--- a/vllm/v1/engine/pycache/parallel_sampling.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/parallel_sampling.cpython-310.pyc
--- a/vllm/v1/engine/pycache/processor.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/processor.cpython-310.pyc
--- a/vllm/v1/engine/pycache/utils.cpython-310.pyc
+++ b/vllm/v1/engine/pycache/utils.cpython-310.pyc
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -0,0 +1,742 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import os
+import socket
+import time
+from collections.abc import AsyncGenerator, Iterable, Mapping
+from copy import copy
+from typing import Any, Optional, Union
+
+import numpy as np
+import torch
+
+import vllm.envs as envs
+from vllm.config import ModelConfig, VllmConfig
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.utils import _validate_truncation_size
+from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
+from vllm.inputs import PromptType
+from vllm.inputs.preprocess import InputPreprocessor
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.outputs import PoolingRequestOutput, RequestOutput
+from vllm.pooling_params import PoolingParams
+from vllm.sampling_params import SamplingParams
+from vllm.tasks import SupportedTask
+from vllm.tracing import init_tracer
+from vllm.transformers_utils.config import (
+    maybe_register_config_serialize_by_value)
+from vllm.transformers_utils.tokenizer import (AnyTokenizer,
+                                               init_tokenizer_from_configs)
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils import (Device, as_list, cancel_task_threadsafe, cdiv,
+                        deprecate_kwargs)
+from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
+from vllm.v1.engine.output_processor import (OutputProcessor,
+                                             RequestOutputCollector)
+from vllm.v1.engine.parallel_sampling import ParentRequest
+from vllm.v1.engine.processor import Processor
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
+from vllm.v1.metrics.prometheus import shutdown_prometheus
+from vllm.v1.metrics.stats import IterationStats
+
+logger = init_logger(__name__)
+
+
+class AsyncLLM(EngineClient):
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+        use_cached_outputs: bool = False,
+        log_requests: bool = True,
+        start_engine_loop: bool = True,
+        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        client_addresses: Optional[dict[str, str]] = None,
+        client_count: int = 1,
+        client_index: int = 0,
+    ) -> None:
+        """
+        Create an AsyncLLM.
+
+        Args:
+            vllm_config: global configuration.
+            executor_class: an Executor impl, e.g. MultiprocExecutor.
+            log_stats: Whether to log stats.
+            usage_context: Usage context of the LLM.
+            mm_registry: Multi-modal registry.
+            use_cached_outputs: Whether to use cached outputs.
+            log_requests: Whether to log requests.
+            start_engine_loop: Whether to start the engine loop.
+            stat_loggers: customized stat loggers for the engine.
+                If not provided, default stat loggers will be used.
+                PLEASE BE AWARE THAT STAT LOGGER IS NOT STABLE
+                IN V1, AND ITS BASE CLASS INTERFACE MIGHT CHANGE.
+            client_addresses: forwarded unchanged to
+                EngineCoreClient.make_async_mp_client.
+            client_count: forwarded to EngineCoreClient.make_async_mp_client
+                and to StatLoggerManager.
+            client_index: forwarded to
+                EngineCoreClient.make_async_mp_client.
+
+        Note:
+            usage_context, use_cached_outputs and start_engine_loop are
+            accepted but not referenced anywhere in this constructor body.
+
+        Raises:
+            ValueError: if envs.VLLM_USE_V1 is falsy.
+
+        Returns:
+            None
+        """
+        if not envs.VLLM_USE_V1:
+            raise ValueError(
+                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
+                "This should not happen. As a workaround, try using "
+                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
+                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
+
+        # Ensure we can serialize custom transformer configs
+        maybe_register_config_serialize_by_value()
+
+        self.model_config = vllm_config.model_config
+        self.vllm_config = vllm_config
+        self.observability_config = vllm_config.observability_config
+        self.log_requests = log_requests
+
+        # Passing custom stat loggers turns stats on even if log_stats=False.
+        self.log_stats = log_stats or (stat_loggers is not None)
+        if not log_stats and stat_loggers is not None:
+            logger.info(
+                "AsyncLLM created with log_stats=False and non-empty custom "
+                "logger list; enabling logging without default stat loggers")
+
+        if self.model_config.skip_tokenizer_init:
+            self.tokenizer = None
+        else:
+            # Tokenizer (+ ensure liveness if running in another process).
+            self.tokenizer = init_tokenizer_from_configs(
+                model_config=vllm_config.model_config)
+
+        # Processor (converts Inputs --> EngineCoreRequests).
+        self.processor = Processor(
+            vllm_config=vllm_config,
+            tokenizer=self.tokenizer,
+            mm_registry=mm_registry,
+        )
+
+        # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
+        self.output_processor = OutputProcessor(self.tokenizer,
+                                                log_stats=self.log_stats)
+        # Attach a tracer only when an OTLP traces endpoint is configured.
+        if self.observability_config.otlp_traces_endpoint is not None:
+            tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+            self.output_processor.tracer = tracer
+
+        # EngineCore (starts the engine in background process).
+        self.engine_core = EngineCoreClient.make_async_mp_client(
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=self.log_stats,
+            client_addresses=client_addresses,
+            client_count=client_count,
+            client_index=client_index,
+        )
+
+        # Loggers.
+        self.logger_manager: Optional[StatLoggerManager] = None
+        if self.log_stats:
+            # enable_default_loggers uses the raw log_stats argument, so the
+            # default loggers stay off when stats were enabled only because
+            # custom stat_loggers were supplied.
+            self.logger_manager = StatLoggerManager(
+                vllm_config=vllm_config,
+                engine_idxs=self.engine_core.engine_ranks_managed,
+                custom_stat_loggers=stat_loggers,
+                enable_default_loggers=log_stats,
+                client_count=client_count,
+            )
+            self.logger_manager.log_engine_initialized()
+
+        self.output_handler: Optional[asyncio.Task] = None
+        try:
+            # Start output handler eagerly if we are in the asyncio eventloop.
+            asyncio.get_running_loop()
+            self._run_output_handler()
+        except RuntimeError:
+            # No running event loop: the handler is started later, from
+            # generate() / encode().
+            pass
+
+        if envs.VLLM_TORCH_PROFILER_DIR:
+            logger.info(
+                "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s",  # noqa: E501
+                envs.VLLM_TORCH_PROFILER_DIR)
+            # Trace files are tagged with host name and PID of this process.
+            worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
+            self.profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                ],
+                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    envs.VLLM_TORCH_PROFILER_DIR,
+                    worker_name=worker_name,
+                    use_gzip=True))
+        else:
+            self.profiler = None
+
+    @classmethod
+    @deprecate_kwargs(
+        "disable_log_requests",
+        additional_message=("This argument will have no effect. "
+                            "Use `enable_log_requests` instead."),
+    )
+    def from_vllm_config(
+            cls,
+            vllm_config: VllmConfig,
+            start_engine_loop: bool = True,
+            usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+            stat_loggers: Optional[list[StatLoggerFactory]] = None,
+            enable_log_requests: bool = False,
+            disable_log_stats: bool = False,
+            client_addresses: Optional[dict[str, str]] = None,
+            client_count: int = 1,
+            client_index: int = 0,
+            disable_log_requests: bool = True,  # Deprecated, will be removed
+    ) -> "AsyncLLM":
+        if not envs.VLLM_USE_V1:
+            raise ValueError(
+                "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
+                "This should not happen. As a workaround, try using "
+                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
+                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
+
+        # Create the LLMEngine.
+        return cls(
+            vllm_config=vllm_config,
+            executor_class=Executor.get_class(vllm_config),
+            start_engine_loop=start_engine_loop,
+            stat_loggers=stat_loggers,
+            log_requests=enable_log_requests,
+            log_stats=not disable_log_stats,
+            usage_context=usage_context,
+            client_addresses=client_addresses,
+            client_count=client_count,
+            client_index=client_index,
+        )
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+    ) -> "AsyncLLM":
+        """Create an AsyncLLM from the EngineArgs."""
+
+        # Create the engine configs.
+        vllm_config = engine_args.create_engine_config(usage_context)
+        executor_class = Executor.get_class(vllm_config)
+
+        # Create the AsyncLLM.
+        return cls(
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_requests=engine_args.enable_log_requests,
+            log_stats=not engine_args.disable_log_stats,
+            start_engine_loop=start_engine_loop,
+            usage_context=usage_context,
+            stat_loggers=stat_loggers,
+        )
+
+    def __del__(self):
+        """Run shutdown() when the instance is finalized."""
+        self.shutdown()
+
+    def shutdown(self):
+        """Shutdown, cleaning up the background proc and IPC."""
+
+        shutdown_prometheus()
+
+        if engine_core := getattr(self, "engine_core", None):
+            engine_core.shutdown()
+
+        cancel_task_threadsafe(getattr(self, "output_handler", None))
+
+    async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        """Return the supported tasks as reported by the engine core client."""
+        return await self.engine_core.get_supported_tasks_async()
+
+    async def add_request(
+        self,
+        request_id: str,
+        prompt: PromptType,
+        params: Union[SamplingParams, PoolingParams],
+        arrival_time: Optional[float] = None,
+        lora_request: Optional[LoRARequest] = None,
+        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
+    ) -> RequestOutputCollector:
+        """Add new request to the AsyncLLM."""
+
+        if self.errored:
+            raise EngineDeadError()
+
+        is_pooling = isinstance(params, PoolingParams)
+
+        # Create a new output collector for the request.
+        queue = RequestOutputCollector(output_kind=params.output_kind)
+
+        # Convert Input --> Request.
+        prompt_str, request = self.processor.process_inputs(
+            request_id, prompt, params, arrival_time, lora_request,
+            tokenization_kwargs, trace_headers, priority, data_parallel_rank)
+
+        if is_pooling or params.n == 1:
+            await self._add_request(request, prompt_str, None, 0, queue)
+            return queue
+
+        # Fan out child requests (for n>1).
+        parent_request = ParentRequest(request_id, params)
+        for idx in range(params.n):
+            request_id, params = parent_request.get_child_info(idx)
+            child_request = request if idx == params.n - 1 else copy(request)
+            child_request.request_id = request_id
+            child_request.sampling_params = params
+            await self._add_request(child_request, prompt_str, parent_request,
+                                    idx, queue)
+        return queue
+
+    async def _add_request(self, request: EngineCoreRequest,
+                           prompt: Optional[str],
+                           parent_req: Optional[ParentRequest], index: int,
+                           queue: RequestOutputCollector):
+        """Register one request with the OutputProcessor, then the EngineCore.
+
+        The OutputProcessor registration happens before the request is sent
+        to the engine core.
+        """
+
+        # Add the request to OutputProcessor (this process).
+        self.output_processor.add_request(request, prompt, parent_req, index,
+                                          queue)
+
+        # Add the EngineCoreRequest to EngineCore (separate process).
+        await self.engine_core.add_request_async(request)
+
+        if self.log_requests:
+            logger.info("Added request %s.", request.request_id)
+
+    # TODO: we should support multiple prompts in one call, as you
+    # can do with LLM.generate. So that for multi-prompt completion
+    # requests we don't need to send multiple messages to core proc,
+    # and so we don't need multiple streams which then get
+    # re-multiplexed in the API server anyhow.
+    async def generate(
+        self,
+        prompt: PromptType,
+        sampling_params: SamplingParams,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
+    ) -> AsyncGenerator[RequestOutput, None]:
+        """
+        Main function called by the API server to kick off a request
+            * 1) Making a RequestOutputCollector for the Request.
+            * 2) Processing the Input.
+            * 3) Adding the Request to the OutputProcessor.
+            * 4) Adding the Request to the EngineCore (separate process).
+
+        A separate output_handler loop runs in a background AsyncIO task,
+        pulling outputs from EngineCore and putting them into the
+        per-request collector.
+
+        The caller of generate() iterates the returned AsyncGenerator,
+        which yields RequestOutputs until one is marked finished.
+
+        Raises:
+            ValueError: if kv_sharing_fast_prefill is enabled and prompt
+                logprobs are requested; validation errors raised while
+                adding the request are re-raised unchanged.
+            EngineDeadError: re-raised unchanged, without aborting.
+            EngineGenerateError: wraps any other unexpected exception,
+                after the request has been aborted.
+        """
+
+        if (self.vllm_config.cache_config.kv_sharing_fast_prefill
+                and sampling_params.prompt_logprobs):
+            raise ValueError(
+                "--kv-sharing-fast-prefill produces incorrect logprobs for "
+                "prompt tokens, please disable it when the requests need "
+                "prompt logprobs")
+
+        try:
+            # We start the output_handler on the first call to generate() so
+            # we can call __init__ before the event loop, which enables us
+            # to handle startup failure gracefully in the OpenAI server.
+            self._run_output_handler()
+
+            tokenization_kwargs: dict[str, Any] = {}
+            truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
+
+            _validate_truncation_size(
+                self.model_config.max_model_len,
+                truncate_prompt_tokens,
+                tokenization_kwargs,
+            )
+
+            q = await self.add_request(
+                request_id,
+                prompt,
+                sampling_params,
+                lora_request=lora_request,
+                trace_headers=trace_headers,
+                priority=priority,
+                tokenization_kwargs=tokenization_kwargs,
+                data_parallel_rank=data_parallel_rank,
+            )
+
+            # The output_handler task pushes items into the queue.
+            # This task pulls from the queue and yields to caller.
+            finished = False
+            while not finished:
+                # Note: drain queue without await if possible (avoids
+                # task switching under load which helps performance).
+                out = q.get_nowait() or await q.get()
+
+                # Note: both OutputProcessor and EngineCore handle their
+                # own request cleanup based on finished.
+                finished = out.finished
+                yield out
+
+        # If the request is disconnected by the client, generate()
+        # is cancelled or the generator is garbage collected. So,
+        # we abort the request if we end up here.
+        except (asyncio.CancelledError, GeneratorExit):
+            await self.abort(request_id)
+            if self.log_requests:
+                logger.info("Request %s aborted.", request_id)
+            raise
+
+        # Engine is dead. Do not abort since we shut down.
+        except EngineDeadError:
+            if self.log_requests:
+                logger.info("Request %s failed (engine dead).", request_id)
+            raise
+
+        # Request validation error.
+        except ValueError:
+            if self.log_requests:
+                logger.info("Request %s failed (bad request).", request_id)
+            raise
+
+        # Unexpected error in the generate() task (possibly recoverable).
+        except Exception as e:
+            await self.abort(request_id)
+            if self.log_requests:
+                logger.info("Request %s failed.", request_id)
+            raise EngineGenerateError() from e
+
+    def _run_output_handler(self):
+        """Background loop: pulls from EngineCore and pushes to AsyncStreams.
+
+        Creates the background asyncio task on first call and stores it in
+        self.output_handler; later calls return immediately.
+        """
+
+        if self.output_handler is not None:
+            return
+
+        # Ensure that the task doesn't have a circular ref back to the AsyncLLM
+        # object, or else it won't be garbage collected and cleaned up properly.
+        engine_core = self.engine_core
+        output_processor = self.output_processor
+        log_stats = self.log_stats
+        logger_manager = self.logger_manager
+
+        async def output_handler():
+            try:
+                while True:
+                    # 1) Pull EngineCoreOutputs from the EngineCore.
+                    outputs = await engine_core.get_output_async()
+                    num_outputs = len(outputs.outputs)
+
+                    # Per-iteration stats are only collected when stats
+                    # logging is on and there is at least one output.
+                    iteration_stats = IterationStats() if (
+                        log_stats and num_outputs) else None
+
+                    # Split outputs into chunks of at most
+                    # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
+                    # event loop for too long.
+                    if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
+                        slices = (outputs.outputs, )
+                    else:
+                        slices = np.array_split(
+                            outputs.outputs,
+                            cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE))
+
+                    for i, outputs_slice in enumerate(slices):
+                        # 2) Process EngineCoreOutputs.
+                        processed_outputs = output_processor.process_outputs(
+                            outputs_slice, outputs.timestamp, iteration_stats)
+                        # NOTE: RequestOutputs are pushed to their queues.
+                        assert not processed_outputs.request_outputs
+
+                        # Allow other asyncio tasks to run between chunks
+                        if i + 1 < len(slices):
+                            await asyncio.sleep(0)
+
+                        # 3) Abort any reqs that finished due to stop strings.
+                        await engine_core.abort_requests_async(
+                            processed_outputs.reqs_to_abort)
+
+                    # 4) Logging.
+                    # TODO(rob): make into a coroutine and launch it in
+                    # background thread once Prometheus overhead is non-trivial.
+                    if logger_manager:
+                        logger_manager.record(
+                            engine_idx=outputs.engine_index,
+                            scheduler_stats=outputs.scheduler_stats,
+                            iteration_stats=iteration_stats,
+                        )
+            except Exception as e:
+                # Any failure ends the loop; the error is handed to the
+                # output processor for propagation.
+                logger.exception("AsyncLLM output_handler failed.")
+                output_processor.propagate_error(e)
+
+        self.output_handler = asyncio.create_task(output_handler())
+
+    async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
+        """Abort RequestId in OutputProcessor and EngineCore."""
+
+        request_ids = (request_id, ) if isinstance(
+            request_id, str) else as_list(request_id)
+        all_request_ids = self.output_processor.abort_requests(request_ids)
+        await self.engine_core.abort_requests_async(all_request_ids)
+
+        if self.log_requests:
+            logger.info("Aborted request(s) %s.", ",".join(request_ids))
+
+    async def encode(
+        self,
+        prompt: PromptType,
+        pooling_params: PoolingParams,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        priority: int = 0,
+        truncate_prompt_tokens: Optional[int] = None,
+        tokenization_kwargs: Optional[dict[str, Any]] = None,
+    ) -> AsyncGenerator[PoolingRequestOutput, None]:
+        """
+        Main function called by the API server to kick off a request
+            * 1) Making an AsyncStream corresponding to the Request.
+            * 2) Processing the Input.
+            * 3) Adding the Request to the EngineCore (separate process).
+
+        A separate output_handler loop runs in a background AsyncIO task,
+        pulling outputs from EngineCore and putting them into the
+        per-request AsyncStream.
+
+        The caller of generate() iterates the returned AsyncGenerator,
+        returning the RequestOutput back to the caller.
+        """
+
+        try:
+            # We start the output_handler on the first call to generate() so
+            # we can call __init__ before the event loop, which enables us
+            # to handle startup failure gracefully in the OpenAI server.
+            self._run_output_handler()
+
+            if tokenization_kwargs is None:
+                tokenization_kwargs = dict[str, Any]()
+            _validate_truncation_size(
+                self.model_config.max_model_len,
+                truncate_prompt_tokens,
+                tokenization_kwargs,
+            )
+
+            q = await self.add_request(
+                request_id,
+                prompt,
+                pooling_params,
+                lora_request=lora_request,
+                trace_headers=trace_headers,
+                priority=priority,
+                tokenization_kwargs=tokenization_kwargs,
+            )
+
+            # The output_handler task pushes items into the queue.
+            # This task pulls from the queue and yields to caller.
+            finished = False
+            while not finished:
+                # Note: drain queue without await if possible (avoids
+                # task switching under load which helps performance).
+                out = q.get_nowait() or await q.get()
+                assert isinstance(out, PoolingRequestOutput)
+                # Note: both OutputProcessor and EngineCore handle their
+                # own request cleanup based on finished.
+                finished = out.finished
+                yield out
+
+        # If the request is disconnected by the client, generate()
+        # is cancelled. So, we abort the request if we end up here.
+        except asyncio.CancelledError:
+            await self.abort(request_id)
+            if self.log_requests:
+                logger.info("Request %s aborted.", request_id)
+            raise
+
+        # Engine is dead. Do not abort since we shut down.
+        except EngineDeadError:
+            if self.log_requests:
+                logger.info("Request %s failed (engine dead).", request_id)
+            raise
+
+        # Request validation error.
+        except ValueError:
+            if self.log_requests:
+                logger.info("Request %s failed (bad request).", request_id)
+            raise
+
+        # Unexpected error in the generate() task (possibly recoverable).
+        except Exception as e:
+            await self.abort(request_id)
+            if self.log_requests:
+                logger.info("Request %s failed.", request_id)
+            raise EngineGenerateError() from e
+
+    async def get_vllm_config(self) -> VllmConfig:
+        """Return the `VllmConfig` stored on this engine (`self.vllm_config`)."""
+        return self.vllm_config
+
+    async def get_model_config(self) -> ModelConfig:
+        """Return the `ModelConfig` stored on this engine (`self.model_config`)."""
+        return self.model_config
+
+    async def get_input_preprocessor(self) -> InputPreprocessor:
+        """Return the input preprocessor owned by `self.processor`."""
+        return self.processor.input_preprocessor
+
+    async def get_tokenizer(self) -> AnyTokenizer:
+        if self.tokenizer is None:
+            raise ValueError("Unable to get tokenizer because "
+                             "skip_tokenizer_init is True")
+
+        return self.tokenizer
+
+    async def is_tracing_enabled(self) -> bool:
+        """Return True when an OTLP traces endpoint is configured.
+
+        Tracing counts as enabled exactly when
+        `observability_config.otlp_traces_endpoint` is not None.
+        """
+        return self.observability_config.otlp_traces_endpoint is not None
+
+    async def do_log_stats(self) -> None:
+        """Call `logger_manager.log()` if a logger manager is set.
+
+        Does nothing when `self.logger_manager` is falsy.
+        """
+        if self.logger_manager:
+            self.logger_manager.log()
+
+    async def check_health(self) -> None:
+        logger.debug("Called check_health.")
+        if self.errored:
+            raise self.dead_error
+
+    async def start_profile(self) -> None:
+        coros = [self.engine_core.profile_async(True)]
+        if self.profiler is not None:
+            coros.append(asyncio.to_thread(self.profiler.start))
+        await asyncio.gather(*coros)
+
+    async def stop_profile(self) -> None:
+        coros = [self.engine_core.profile_async(False)]
+        if self.profiler is not None:
+            coros.append(asyncio.to_thread(self.profiler.stop))
+        await asyncio.gather(*coros)
+
+    async def reset_mm_cache(self) -> None:
+        """Clear the processor's cache, then reset the engine core's
+        multimodal cache."""
+        self.processor.clear_cache()
+        await self.engine_core.reset_mm_cache_async()
+
+    async def reset_prefix_cache(self,
+                                 device: Optional[Device] = None) -> None:
+        """Ask the engine core to reset its prefix cache.
+
+        Args:
+            device: Only checked against `Device.CPU`; it is not forwarded
+                to the engine core.
+
+        Raises:
+            ValueError: If `device` is `Device.CPU`.
+        """
+        if device == Device.CPU:
+            raise ValueError("Not supported on CPU.")
+        await self.engine_core.reset_prefix_cache_async()
+
+    async def sleep(self, level: int = 1) -> None:
+        """Reset the prefix cache, then put the engine core to sleep.
+
+        Args:
+            level: Passed through unchanged to `engine_core.sleep_async`.
+        """
+        await self.reset_prefix_cache()
+        await self.engine_core.sleep_async(level)
+
+    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        """Wake the engine core, forwarding `tags` to
+        `engine_core.wake_up_async` unchanged."""
+        await self.engine_core.wake_up_async(tags)
+
+    async def is_sleeping(self) -> bool:
+        """Return the result of `engine_core.is_sleeping_async()`."""
+        return await self.engine_core.is_sleeping_async()
+
+    async def add_lora(self, lora_request: LoRARequest) -> bool:
+        """Load a new LoRA adapter into the engine for future requests.
+
+        Delegates to `engine_core.add_lora_async` and returns its result.
+        """
+        return await self.engine_core.add_lora_async(lora_request)
+
+    async def remove_lora(self, lora_id: int) -> bool:
+        """Remove an already loaded LoRA adapter.
+
+        Delegates to `engine_core.remove_lora_async` and returns its result.
+        """
+        return await self.engine_core.remove_lora_async(lora_id)
+
+    async def list_loras(self) -> set[int]:
+        """List all registered adapters.
+
+        Delegates to `engine_core.list_loras_async` and returns its result.
+        """
+        return await self.engine_core.list_loras_async()
+
+    async def pin_lora(self, lora_id: int) -> bool:
+        """Prevent an adapter from being evicted.
+
+        Delegates to `engine_core.pin_lora_async` and returns its result.
+        """
+        return await self.engine_core.pin_lora_async(lora_id)
+
+    async def collective_rpc(self,
+                             method: str,
+                             timeout: Optional[float] = None,
+                             args: tuple = (),
+                             kwargs: Optional[dict] = None):
+        """
+        Perform a collective RPC call of the given method.
+
+        All arguments are forwarded positionally, unchanged, to
+        `engine_core.collective_rpc_async`, whose result is returned.
+
+        Args:
+            method: Name of the method to invoke.
+            timeout: Optional timeout forwarded to the engine core.
+            args: Positional arguments for the method.
+            kwargs: Keyword arguments for the method, or None.
+        """
+        return await self.engine_core.collective_rpc_async(
+            method, timeout, args, kwargs)
+
+    async def wait_for_requests_to_drain(self, drain_timeout: int = 300):
+        """Wait for all requests to be drained."""
+        start_time = time.time()
+        while time.time() - start_time < drain_timeout:
+            if not self.engine_core.dp_engines_running():
+                logger.info("Engines are idle, requests have been drained")
+                return
+
+            logger.info(
+                "Engines are still running, waiting for requests to drain...")
+            await asyncio.sleep(1)  # Wait 1 second before checking again
+
+        raise TimeoutError(f"Timeout reached after {drain_timeout} seconds "
+                           "waiting for requests to drain.")
+
+    async def scale_elastic_ep(self,
+                               new_data_parallel_size: int,
+                               drain_timeout: int = 300):
+        """
+        Scale up or down the data parallel size by adding or removing
+        engine cores.
+        Args:
+            new_data_parallel_size: The new number of data parallel workers
+            drain_timeout:
+                Maximum time to wait for requests to drain (seconds)
+        """
+        old_data_parallel_size = \
+            self.vllm_config.parallel_config.data_parallel_size
+        if old_data_parallel_size == new_data_parallel_size:
+            logger.info("Data parallel size is already %s, skipping scale",
+                        new_data_parallel_size)
+            return
+        logger.info(
+            "Waiting for requests to drain before "
+            "scaling up to %s engines...", new_data_parallel_size)
+        await self.wait_for_requests_to_drain(drain_timeout)
+        logger.info(
+            "Requests have been drained, proceeding with scale "
+            "to %s engines", new_data_parallel_size)
+        await self.engine_core.scale_elastic_ep(new_data_parallel_size)
+        self.vllm_config.parallel_config.data_parallel_size = \
+            new_data_parallel_size
+
+        # recreate stat loggers
+        if new_data_parallel_size > old_data_parallel_size and self.log_stats:
+            # TODO(rob): fix this after talking with Ray team.
+            # This resets all the prometheus metrics since we
+            # unregister during initialization. Need to understand
+            # the intended behavior here better.
+            self.logger_manager = StatLoggerManager(
+                vllm_config=self.vllm_config,
+                engine_idxs=list(range(new_data_parallel_size)),
+                custom_stat_loggers=None,
+            )
+
+    @property
+    def is_running(self) -> bool:
+        """True if the output handler has not been started yet or is not
+        done."""
+        # Is None before the loop is started.
+        return self.output_handler is None or not self.output_handler.done()
+
+    @property
+    def is_stopped(self) -> bool:
+        """Alias for `errored`."""
+        return self.errored
+
+    @property
+    def errored(self) -> bool:
+        """True if the engine core is flagged dead or `is_running` is False."""
+        return self.engine_core.resources.engine_dead or not self.is_running
+
+    @property
+    def dead_error(self) -> BaseException:
+        """Return a new `EngineDeadError` instance on each access."""
+        return EngineDeadError()
--- a/vllm/v1/engine/coordinator.py
+++ b/vllm/v1/engine/coordinator.py
@@ -0,0 +1,357 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
+import multiprocessing
+import time
+import weakref
+from typing import Optional
+
+import msgspec.msgpack
+import zmq
+
+from vllm.config import ParallelConfig
+from vllm.logger import init_logger
+from vllm.utils import get_mp_context, make_zmq_socket, set_process_title
+from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequestType
+from vllm.v1.serial_utils import MsgpackDecoder
+from vllm.v1.utils import get_engine_client_zmq_addr, shutdown
+
+# Module-level logger, named after this module.
+logger = init_logger(__name__)
+
+
+class DPCoordinator:
+    """Coordinator process used for data-parallel deployments (DP>1).
+
+    Intermediates between multiple DP engine rank processes and one or more
+    front-end API server processes.
+
+    * Collects stats from each DP engine (currently just waiting and running
+      queue lengths), and publishes these to all front-ends for use in
+      load-balancing decisions.
+
+    * Keeps track of the current DP "request wave" number and running state
+      of the engines. This is received from the DP rank 0 engine and published
+      to the front-end processes along with the current load stats.
+
+      The engines alternate between a global running/paused state. The global
+      "request wave" number is a count of the number of times that the workers
+      collectively move from a running state to a paused state. This transition
+      is synchronized via the all-reduce operation performed in the
+      DPEngineCoreProc._has_global_unfinished_reqs method.
+
+    * Broadcasts the START_DP_WAVE message to engines to move them from paused
+      to running state when one engine receives a new request. This can happen
+      in two cases:
+      1) A front-end sending a new request while the engines are paused will
+         concurrently notify the coordinator.
+      2) An engine receiving a request for a stale request wave while in paused
+         state will notify the coordinator.
+
+    Engines will move into running state when receiving a new request or
+    START_DP_WAVE message.
+
+    Note that when deployed in External LB mode, no stats will be published by
+    the engines and thus updates will only be sent to front-ends when the
+    request wave / running state changes.
+    """
+
+    def __init__(self, parallel_config: ParallelConfig):
+
+        dp_size = parallel_config.data_parallel_size
+        assert dp_size > 1, "Coordinator only used for data parallel"
+
+        host = parallel_config.data_parallel_master_ip
+        external_lb = parallel_config.data_parallel_external_lb
+        hybrid_lb = parallel_config.data_parallel_hybrid_lb
+
+        # Assume coordinator is colocated with front-end procs when not in
+        # either external or hybrid DP LB mode.
+        local_only = not (external_lb or hybrid_lb)
+        front_publish_address = get_engine_client_zmq_addr(
+            local_only=local_only, host=host)
+
+        local_only_eng = dp_size == parallel_config.data_parallel_size_local
+        back_publish_address = get_engine_client_zmq_addr(local_only_eng, host)
+        back_output_address = get_engine_client_zmq_addr(local_only_eng, host)
+
+        context = get_mp_context()
+        self.proc: multiprocessing.Process = context.Process(
+            target=DPCoordinatorProc.run_coordinator,
+            name="VLLM_DP_Coordinator",
+            kwargs={
+                "engine_count": parallel_config.data_parallel_size,
+                "front_publish_address": front_publish_address,
+                "back_output_address": back_output_address,
+                "back_publish_address": back_publish_address,
+            },
+            daemon=True)
+        self.proc.start()
+
+        self.stats_publish_address = front_publish_address
+        self.coord_in_address = back_publish_address
+        self.coord_out_address = back_output_address
+        self._finalizer = weakref.finalize(self, shutdown, [self.proc])
+
+    def get_stats_publish_address(self) -> str:
+        return self.stats_publish_address
+
+    def get_engine_socket_addresses(self) -> tuple[str, str]:
+        """Returns tuple of ZMQ input address, output address."""
+        return self.coord_in_address, self.coord_out_address
+
+    def close(self):
+        self._finalizer()
+
+
+class EngineState:
+    """Per-engine state tracked by the coordinator: its request counts."""
+
+    def __init__(self):
+        # Mutable two-element list, updated in place by the coordinator.
+        self.request_counts = [0, 0]  # [waiting, running]
+
+
+class DPCoordinatorProc:
+    """Logic that runs inside the coordinator process.
+
+    Holds a ZMQ context and one `EngineState` per engine, and runs the
+    polling loop in `process_input_socket`.
+    """
+
+    def __init__(self,
+                 engine_count: int,
+                 min_stats_update_interval_ms: int = 100):
+        """Set the process title and create per-engine state.
+
+        Args:
+            engine_count: Number of `EngineState` entries to create.
+            min_stats_update_interval_ms: Publish interval (ms) used by the
+                polling loop while stats have changed.
+        """
+        set_process_title("DPCoordinator")
+        self.ctx = zmq.Context()
+
+        self.engines = [EngineState() for _ in range(engine_count)]
+
+        self.stats_update_interval_ms = min_stats_update_interval_ms
+
+    @staticmethod
+    def run_coordinator(
+        engine_count: int,
+        front_publish_address: str,
+        back_output_address: str,
+        back_publish_address: str,
+        min_stats_update_interval_ms: int = 100,
+    ):
+        """Build a `DPCoordinatorProc` and run its socket loop.
+
+        A `KeyboardInterrupt` raised by the loop is logged and swallowed;
+        other exceptions propagate.
+        """
+        coordinator = DPCoordinatorProc(
+            engine_count=engine_count,
+            min_stats_update_interval_ms=min_stats_update_interval_ms)
+        try:
+            coordinator.process_input_socket(
+                front_publish_address,
+                back_output_address,
+                back_publish_address,
+            )
+        except KeyboardInterrupt:
+            logger.info("DP Coordinator process exiting")
+
+    def process_input_socket(self, front_publish_address: str,
+                             back_output_address: str,
+                             back_publish_address: str):
+        """Bind the three sockets and run the coordinator loop.
+
+        Sockets (all bound here):
+          * `publish_front` (XPUB): publishes stats / wave state and also
+            receives messages sent by the front-ends.
+          * `output_back` (PULL): receives `EngineCoreOutputs` from engines.
+          * `publish_back` (XPUB): broadcasts to engines (READY,
+            START_DP_WAVE).
+
+        The loop only exits early (returns) if an unexpected message arrives
+        while waiting for the engine subscriptions; otherwise it runs until
+        an exception is raised.
+        """
+
+        decoder = MsgpackDecoder(EngineCoreOutputs)
+
+        # For tracking request wave progression.
+        current_wave = 0
+        engines_running = False
+
+        # For tracking request counts for internal load-balancing.
+        stats_changed = False
+        last_stats_step = -1
+        last_stats_wave = -1
+        # Snapshot of the counts taken when a newer step's stats arrive;
+        # published (and cleared) on the next poller timeout.
+        last_step_counts: Optional[list[list[int]]] = None
+
+        with make_zmq_socket(
+                path=front_publish_address,  # IPC
+                ctx=self.ctx,
+                socket_type=zmq.XPUB,
+                bind=True,
+        ) as publish_front, make_zmq_socket(
+                path=back_output_address,  # IPC or TCP
+                ctx=self.ctx,
+                socket_type=zmq.PULL,
+                bind=True,
+        ) as output_back, make_zmq_socket(
+                path=back_publish_address,  # IPC or TCP
+                ctx=self.ctx,
+                socket_type=zmq.XPUB,
+                bind=True,
+        ) as publish_back:
+
+            # Wait until all engines subscribe.
+            # One b'\x01' message is expected per engine.
+            for _ in self.engines:
+                if publish_back.recv() != b'\x01':
+                    logger.error(
+                        "DP Coordinator received unexpected message while "
+                        "waiting for engines to subscribe")
+                    return
+            # Send ready message to engines.
+            publish_back.send(b"READY")
+
+            logger.info("All engine subscriptions received by DP coordinator")
+
+            poller = zmq.Poller()
+            poller.register(publish_front, zmq.POLLIN)
+            poller.register(output_back, zmq.POLLIN)
+            # Millisecond timestamp of the last stats publish (0 = never).
+            last_publish_time = 0
+            while True:
+                elapsed = int(time.time() * 1000) - last_publish_time
+                # Send at stats_update_interval_ms interval if the stats have
+                # changed, or otherwise every 5 seconds.
+                wait_for = (self.stats_update_interval_ms
+                            if stats_changed else 5000)
+
+                # Wait at least 50ms to ensure we've received all stats for
+                # the current step.
+                min_timeout = 50 if last_step_counts is None else 0
+
+                events = poller.poll(timeout=max(min_timeout, wait_for -
+                                                 elapsed))
+                if not events:
+                    # Poller timeout - publish current stats to front-ends.
+                    if last_step_counts is not None:
+                        # Publish the pending snapshot first; stats_changed
+                        # stays set so the live counts follow later.
+                        engine_req_counts_list = last_step_counts
+                        last_step_counts = None
+                    else:
+                        engine_req_counts_list = self._get_engine_counts()
+                        stats_changed = False
+
+                    to_publish = (engine_req_counts_list, current_wave,
+                                  engines_running)
+                    publish_front.send(msgspec.msgpack.encode(to_publish))
+                    last_publish_time = int(time.time() * 1000)
+                    continue
+
+                events = dict(events)
+                wave_state_changed = False
+
+                if publish_front in events:
+                    buffer = publish_front.recv()
+                    if buffer in (b'\x01', b'\x00'):
+                        # Ignore subscription messages.
+                        continue
+
+                    decoded = msgspec.msgpack.decode(buffer)
+                    if isinstance(decoded, (list, tuple)) and len(
+                            decoded) == 2 and decoded[0] == "SCALE_ELASTIC_EP":
+                        # Handle scale up notification
+                        new_engine_count = decoded[1]
+                        current_count = len(self.engines)
+                        if new_engine_count > current_count:
+                            for _ in range(new_engine_count - current_count):
+                                self.engines.append(EngineState())
+                            # NOTE(yongji): handle the case
+                            # where newly started engines have current_wave = 0
+                            # if existing engines just finished a wave
+                            # and engine_running isn't updated yet at
+                            # CoordinatorProc requests routed to newly started
+                            # engines may not wake up existing engines, as long
+                            # as 0 < request.wave < existing engines'
+                            # current_wave
+                            # we note that 0 is the wave number for the new
+                            # engine
+                            engines_running = False
+                            logger.info(
+                                "DPCoordinator scaled up from %s to %s "
+                                "engines", current_count, new_engine_count)
+                        else:
+                            # Also taken when the count is unchanged.
+                            self.engines = self.engines[:new_engine_count]
+                            logger.info(
+                                "DPCoordinator scaled down from %s to %s "
+                                "engines", current_count, new_engine_count)
+                        continue  # Skip normal engine notification processing
+
+                    # We received a message on the front-end XPUB socket,
+                    # from an API server sending a new request while the
+                    # engines are paused, so that we can wake the other
+                    # engines.
+                    engine_to_exclude, wave = decoded
+                    if not engines_running:
+                        if wave < current_wave:
+                            # If the wave number is stale, ensure the message
+                            # is handled by all the engines.
+                            engine_to_exclude = None
+
+                        engines_running = True
+                        wave_state_changed = True
+                        self._send_start_wave(publish_back, current_wave,
+                                              engine_to_exclude)
+
+                if output_back in events:
+                    # We received a message from one of the engines.
+
+                    buffer = output_back.recv()
+                    outputs: EngineCoreOutputs = decoder.decode(buffer)
+
+                    # Only control/stats messages are expected here.
+                    assert not outputs.outputs
+                    assert outputs.utility_output is None
+
+                    eng_index = outputs.engine_index
+                    scheduler_stats = outputs.scheduler_stats
+                    if scheduler_stats:
+                        # 1. Updated request load stats - update our local
+                        # state with these.
+                        stats = self.engines[eng_index].request_counts
+                        stats_step = scheduler_stats.step_counter
+                        stats_wave = scheduler_stats.current_wave
+                        # (wave, step) is strictly newer than the last seen.
+                        if (stats_wave > last_stats_wave
+                                or stats_wave == last_stats_wave
+                                and stats_step > last_stats_step):
+                            if stats_changed:
+                                # Snapshot the previous step's counts before
+                                # they are overwritten below.
+                                last_step_counts = self._get_engine_counts(
+                                    do_copy=True)
+                            last_stats_step = stats_step
+                            last_stats_wave = stats_wave
+                        elif stats_wave != last_stats_wave or (
+                                stats_step != last_stats_step):
+                            logger.warning(
+                                "Received stats for out-of-order "
+                                "step (%d, %d) from engine %d (expected "
+                                "> (%d, %d))", stats_wave, stats_step,
+                                eng_index, last_stats_wave, last_stats_step)
+                        stats[0] = scheduler_stats.num_waiting_reqs
+                        stats[1] = scheduler_stats.num_running_reqs
+                        stats_changed = True
+
+                    if (wave := outputs.wave_complete) is not None:
+                        # 2. Notification from rank 0 engine that we've
+                        # moved into the global paused state
+                        # (engines_running==False).
+                        if current_wave <= wave:
+                            new_wave = wave + 1
+                            logger.debug("Moving DP wave from %d to %d.",
+                                         current_wave, new_wave)
+                            current_wave = new_wave
+                            engines_running = False
+                            wave_state_changed = True
+                    elif (wave := outputs.start_wave) is not None and (
+                            wave > current_wave or
+                        (wave == current_wave and not engines_running)):
+                        # 3. The engine received request for a non-current wave
+                        # so we must ensure that other engines progress to the
+                        # next wave (race condition handling).
+                        logger.debug(
+                            "Starting wave %d after notification of "
+                            "stale wave request from engine.", wave)
+                        current_wave = wave
+                        engines_running = True
+                        wave_state_changed = True
+                        self._send_start_wave(publish_back, wave, eng_index)
+
+                if wave_state_changed:
+                    # Wave-only update: the counts slot is None.
+                    message = (None, current_wave, engines_running)
+                    publish_front.send(msgspec.msgpack.encode(message))
+
+    @staticmethod
+    def _send_start_wave(socket: zmq.Socket, wave: int,
+                         exclude_engine_index: Optional[int]):
+        """Broadcast the START_DP_WAVE message to all the engines.
+        It includes the current wave number and index of engine which
+        has already received a request with this wave number and so doesn't
+        require additional notification.
+        """
+        wave_encoded = msgspec.msgpack.encode((wave, exclude_engine_index))
+        # Two-frame message: request type, then the encoded payload.
+        socket.send_multipart(
+            (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded))
+
+    def _get_engine_counts(self, do_copy=False) -> list[list[int]]:
+        """Return list of [waiting, running] count lists for each engine.
+
+        With `do_copy=False` the inner lists are the engines' own
+        `request_counts` objects (not copies); with `do_copy=True` each
+        inner list is a shallow copy.
+        """
+        if do_copy:
+            return [copy.copy(e.request_counts) for e in self.engines]
+        return [e.request_counts for e in self.engines]
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -0,0 +1,349 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import tokenizers
+from packaging import version
+from tokenizers import Tokenizer
+from tokenizers.decoders import DecodeStream
+from transformers import PreTrainedTokenizerFast
+
+from vllm.logger import init_logger
+from vllm.transformers_utils.detokenizer_utils import (
+    AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
+from vllm.utils import length_from_prompt_token_ids_or_embeds
+from vllm.v1.engine import EngineCoreRequest
+
+# Module-level logger, named after this module.
+logger = init_logger(__name__)
+
+# Only tokenizers >= 0.21.1 supports DecodeStream used for
+# FastIncrementalDetokenizer.
+USE_FAST_DETOKENIZER = version.parse(
+    tokenizers.__version__) >= version.parse("0.21.1")
+
+# Prefix of the exception message that marks a recoverable DecodeStream
+# error (matched with str.startswith).
+# Error string from https://github.com/huggingface/tokenizers/blob/909fdde2a4ffedd9295206f705eb612be2a91b12/tokenizers/src/tokenizer/mod.rs#L1042
+INVALID_PREFIX_ERR_MSG = "Invalid prefix encountered"
+
+
+class IncrementalDetokenizer:
+
+    def __init__(self):
+        self.token_ids: list[int] = []
+
+    @property
+    def output_token_ids(self) -> list[int]:
+        return self.token_ids
+
+    def update(self, new_token_ids: list[int],
+               stop_terminated: bool) -> Optional[str]:
+        self.token_ids.extend(new_token_ids)
+        return None
+
+    def get_next_output_text(self, finished: bool, delta: bool) -> str:
+        return ""
+
+    @classmethod
+    def from_new_request(
+        cls,
+        tokenizer: Optional[AnyTokenizer],
+        request: EngineCoreRequest,
+    ) -> "IncrementalDetokenizer":
+
+        assert request.sampling_params is not None
+
+        if tokenizer is None:
+            # No tokenizer => skipping detokenization.
+            return IncrementalDetokenizer()
+
+        if USE_FAST_DETOKENIZER and isinstance(tokenizer,
+                                               PreTrainedTokenizerFast):
+            # Fast tokenizer => use tokenizers library DecodeStream.
+            return FastIncrementalDetokenizer(tokenizer, request)
+
+        # Fall back to slow python-based incremental detokenization.
+        return SlowIncrementalDetokenizer(tokenizer, request)
+
+
+class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
+
+    def __init__(self, request: EngineCoreRequest):
+        super().__init__()
+
+        # Stop strings
+        params = request.sampling_params
+        assert params is not None
+        self.stop = stop = params.stop
+        self.min_tokens = params.min_tokens
+        self.include_stop_str_in_output = params.include_stop_str_in_output
+
+        # Number of chars to hold back when stop strings are to be excluded
+        # from streamed output.
+        if stop and not self.include_stop_str_in_output:
+            self.stop_buffer_length = max(len(s) for s in stop) - 1
+        else:
+            self.stop_buffer_length = 0
+        self._last_output_text_offset: int = 0
+
+        # Generation data
+        self.output_text = ""
+
+    def update(self, new_token_ids: list[int],
+               stop_terminated: bool) -> Optional[str]:
+        """
+        Update RequestState for the request_id by:
+            1) Detokenize the new token ids incrementally.
+            2) Evaluate stop criteria.
+
+        Return matched stop string or None.
+        """
+        if not new_token_ids:
+            # Skip detokenization if no new token ids.
+            return None
+
+        if stop_terminated and not self.include_stop_str_in_output:
+            # If stop-terminated, exclude last token from detokenization
+            # based on include_stop_str_in_output parameter.
+            skipped_stop_token_id = new_token_ids[-1]
+            new_token_ids = new_token_ids[:-1]
+        else:
+            skipped_stop_token_id = None
+
+        # 1) Detokenize the new token ids incrementally.
+        # TODO(woosuk): This method becomes very inefficient when the number of
+        # new_token_ids is more than 1. We need to optimize this.
+        stop_check_offset = len(self.output_text)
+        for new_token_id in new_token_ids:
+            self.token_ids.append(new_token_id)
+            self.output_text += self.decode_next(new_token_id)
+            # Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014
+            if self.min_tokens and len(
+                    self.output_token_ids) <= self.min_tokens:
+                stop_check_offset = len(self.output_text)
+
+        if skipped_stop_token_id is not None:
+            # Cleanup after skipping detokenization.
+            self.token_ids.append(skipped_stop_token_id)
+
+        # 2) Evaluate stop strings.
+        stop_string = None
+        if self.stop and len(self.output_token_ids) > self.min_tokens:
+            stop = check_stop_strings(
+                output_text=self.output_text,
+                new_char_count=len(self.output_text) - stop_check_offset,
+                stop=self.stop,
+                include_in_output=self.include_stop_str_in_output,
+            )
+            if stop is not None:
+                stop_string, truncate_to = stop
+                if truncate_to != -1:
+                    self.output_text = self.output_text[:truncate_to]
+
+        return stop_string
+
+    @abstractmethod
+    def decode_next(self, next_token_id: int) -> str:
+        raise NotImplementedError
+
+    def get_next_output_text(self, finished: bool, delta: bool) -> str:
+        """If delta is True, only new text since the last call to
+        this method is returned"""
+
+        # We return the full output text if the sequence is finished.
+        buffer_length = 0 if finished else self.stop_buffer_length
+        if not delta:
+            return self.output_text[:-buffer_length] if buffer_length else (
+                self.output_text)
+        length = len(self.output_text) - buffer_length
+        last_offset = self._last_output_text_offset
+        if last_offset < length:
+            self._last_output_text_offset = length
+            return self.output_text[last_offset:length]
+        return ""
+
+
+class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
+    """Incremental detokenizer driven by a `tokenizers` DecodeStream.
+
+    Each new token id is stepped through a ``DecodeStream``. When spaces
+    between special tokens are disabled, the raw content of an added
+    token that directly follows another added token is returned instead
+    of the stream output.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerFast,
+                 request: EngineCoreRequest):
+        """Create the decode stream and prime it with the prompt tail.
+
+        Args:
+            tokenizer: fast tokenizer; its backing ``_tokenizer`` is used
+                for all decoding.
+            request: request supplying the prompt token ids and the
+                sampling params (which must not be None).
+        """
+        super().__init__(request)
+
+        sampling_params = request.sampling_params
+        assert sampling_params is not None
+
+        self.request_id = request.request_id
+        self.skip_special_tokens = sampling_params.skip_special_tokens
+        self.stream = DecodeStream(
+            skip_special_tokens=self.skip_special_tokens)
+
+        self.tokenizer: Tokenizer = tokenizer._tokenizer
+
+        # Find a safe place to start: the shortest prompt suffix (between
+        # 4 and 23 tokens) whose decoding contains no U+FFFD replacement
+        # character. If none qualifies, the whole prompt is used.
+        prompt_token_ids = request.prompt_token_ids or []
+        prompt_suffix = prompt_token_ids
+        prompt_len = len(prompt_suffix)
+        if prompt_len > 4:
+            for i in range(4, min(prompt_len + 1, 24)):
+                suffix = prompt_token_ids[-i:]
+                # Written as an escape so the check cannot be corrupted
+                # by a file re-encoding.
+                if "\ufffd" not in self.tokenizer.decode(suffix):
+                    prompt_suffix = suffix
+                    break
+
+        # Prime the stream.
+        for tid in prompt_suffix:
+            self._protected_step(tid)
+
+        # Skipping special tokens makes the spacing option irrelevant, so
+        # it is treated as enabled in that case.
+        self.spaces_between_special_tokens = (
+            sampling_params.skip_special_tokens
+            or sampling_params.spaces_between_special_tokens)
+
+        if not self.spaces_between_special_tokens:
+            # Store dict of added token ids so that we can suppress
+            # the spaces between them. The dict is cached as an attribute
+            # on the tokenizer object so it is only built once.
+            if (added_token_ids := getattr(self.tokenizer, "added_token_ids",
+                                           None)) is None:
+                self.tokenizer.added_token_ids = added_token_ids = {
+                    tid: tok.content
+                    for tid, tok in
+                    self.tokenizer.get_added_tokens_decoder().items()
+                }
+
+            if added_token_ids:
+                self.last_special = False
+                self.added_token_ids = added_token_ids
+            else:
+                # No added tokens.
+                self.spaces_between_special_tokens = True
+
+    def decode_next(self, next_token_id: int) -> str:
+        """Decode one more token id and return its text.
+
+        Returns:
+            The newly produced text, or "" when the stream yields nothing.
+        """
+        token = self._protected_step(next_token_id)
+
+        if not self.spaces_between_special_tokens:
+            special_token = self.added_token_ids.get(next_token_id)
+            is_special = special_token is not None
+            if is_special and self.last_special:
+                # Return raw token string without any prefixed spaces.
+                token = special_token
+            self.last_special = is_special
+
+        return token or ""
+
+    def _protected_step(self, next_token_id: int) -> Optional[str]:
+        """Step the decode stream, recovering from known failure modes.
+
+        Returns:
+            The text produced by the stream for this token, or None.
+
+        Raises:
+            Exception: any error from the stream other than
+                OverflowError or the invalid-prefix error.
+        """
+        try:
+            token = self.stream.step(self.tokenizer, next_token_id)
+        except OverflowError:
+            # Handle rare observed overflow, still to be diagnosed.
+            # See https://github.com/vllm-project/vllm/issues/21951.
+            logger.exception("Encountered invalid token id: %d", next_token_id)
+            token = None
+        except Exception as e:
+            if not str(e).startswith(INVALID_PREFIX_ERR_MSG):
+                raise
+            # Recover from edge case where tokenizer can produce non-monotonic,
+            # invalid UTF-8 output, which breaks the internal state of
+            # tokenizers' DecodeStream.
+            # See https://github.com/vllm-project/vllm/issues/17448.
+            logger.warning(
+                "Encountered invalid prefix detokenization error"
+                " for request %s, resetting decode stream.", self.request_id)
+            self.stream = DecodeStream(
+                skip_special_tokens=self.skip_special_tokens)
+            token = self.stream.step(self.tokenizer, next_token_id)
+        return token
+
+
+class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
+    """Incremental detokenizer built on ``detokenize_incrementally``.
+
+    Keeps the full token id list (prompt followed by output) together
+    with the token strings and the prefix/read offsets that
+    ``detokenize_incrementally`` needs between calls.
+    """
+
+    def __init__(self, tokenizer: AnyTokenizer, request: EngineCoreRequest):
+        """Initialise detokenization state from the request's prompt.
+
+        Args:
+            tokenizer: tokenizer used for incremental detokenization.
+            request: request supplying the prompt (token ids or embeds)
+                and the sampling params (which must not be None).
+        """
+        super().__init__(request)
+
+        self.tokenizer = tokenizer
+        params = request.sampling_params
+        assert params is not None
+
+        self.prompt_len = length_from_prompt_token_ids_or_embeds(
+            request.prompt_token_ids, request.prompt_embeds)
+
+        # Metadata for incremental detokenization.
+        if request.prompt_token_ids is not None:
+            self.tokens, self.prefix_offset, self.read_offset = (
+                convert_prompt_ids_to_tokens(
+                    tokenizer=tokenizer,
+                    prompt_ids=request.prompt_token_ids,
+                    skip_special_tokens=params.skip_special_tokens,
+                ))
+        else:
+            # Prompt embedding requests cannot be detokenized, in general.
+            self.tokens = [""] * self.prompt_len
+            self.prefix_offset = 0
+            # decode_next reads self.read_offset, so it must be set here.
+            self.read_offset = 0
+
+        # Placeholder id 0 stands in for each position when the prompt
+        # has no token ids.
+        self.token_ids.extend(request.prompt_token_ids
+                              or [0] * self.prompt_len)
+
+        self.skip_special_tokens = params.skip_special_tokens
+        self.spaces_between_special_tokens = (
+            params.spaces_between_special_tokens)
+
+    @property
+    def output_token_ids(self) -> list[int]:
+        """Token ids generated so far, excluding the prompt portion."""
+        return self.token_ids if not self.prompt_len else (
+            self.token_ids[self.prompt_len:])
+
+    def decode_next(self, next_token_id: int) -> str:
+        """Detokenize the newest token and return the text it adds.
+
+        ``next_token_id`` is not passed on directly; the call works from
+        ``self.token_ids``.
+        """
+        new_tokens, decoded_text, prefix_offset, read_offset = (
+            detokenize_incrementally(
+                tokenizer=self.tokenizer,
+                all_input_ids=self.token_ids,
+                prev_tokens=self.tokens,
+                prefix_offset=self.prefix_offset,
+                read_offset=self.read_offset,
+                skip_special_tokens=self.skip_special_tokens,
+                spaces_between_special_tokens=self.
+                spaces_between_special_tokens,
+            ))
+
+        # Carry the updated state over to the next call.
+        self.tokens.extend(new_tokens)
+        self.prefix_offset = prefix_offset
+        self.read_offset = read_offset
+
+        return decoded_text
+
+
+def check_stop_strings(
+    output_text: str,
+    new_char_count: int,
+    stop: list[str],
+    include_in_output: bool,
+) -> Optional[tuple[str, int]]:
+    """Look for a stop string in the newly added tail of the output.
+
+    Stop strings are tried in list order and the first one found wins.
+
+    Returns:
+        None when nothing matches (or there is no new text / no stop
+        strings); otherwise ``(stop_string, offset)`` where ``offset``
+        is the length ``output_text`` should be cut to, or -1 when no
+        truncation is needed.
+    """
+    if new_char_count and stop:
+        text_len = len(output_text)
+        for candidate in stop:
+            width = len(candidate)
+            # Only the region that could overlap the new characters is
+            # scanned; earlier text was searched on previous calls.
+            search_from = 1 - new_char_count - width
+            found_at = output_text.find(candidate, search_from)
+            if found_at < 0:
+                continue
+
+            if not include_in_output:
+                # Cut at the start of the stop string.
+                return candidate, found_at
+
+            # Cut at the end of the stop string, unless that is already
+            # the end of the text.
+            cut = found_at + width
+            return candidate, (-1 if cut >= text_len else cut)
+    return None
--- a/vllm/v1/engine/exceptions.py
+++ b/vllm/v1/engine/exceptions.py
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+class EngineGenerateError(Exception):
+    """Recoverable error: an AsyncLLM.generate() call failed."""
+
+
+class EngineDeadError(Exception):
+    """Unrecoverable error: the EngineCore has died.
+
+    A fixed explanatory message is always the first exception argument;
+    any positional arguments from the caller follow it.
+    """
+
+    def __init__(self, *args, suppress_context: bool = False, **kwargs):
+        message = ("EngineCore encountered an issue. "
+                   "See stack trace (above) for the root cause.")
+        super().__init__(message, *args, **kwargs)
+
+        # Lets callers (e.g. LLMEngine) hide an irrelevant ZMQError
+        # context so the stack trace is easier to read.
+        self.__suppress_context__ = suppress_context
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -0,0 +1,370 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import time
+from collections.abc import Mapping
+from copy import copy
+from typing import Any, Callable, Optional, Union
+
+import torch.nn as nn
+from typing_extensions import TypeVar
+
+import vllm.envs as envs
+from vllm.config import ParallelConfig, VllmConfig
+from vllm.distributed import stateless_destroy_torch_distributed_process_group
+from vllm.distributed.parallel_state import get_dp_group
+from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import PromptType
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.outputs import PoolingRequestOutput, RequestOutput
+from vllm.pooling_params import PoolingParams
+from vllm.sampling_params import SamplingParams
+from vllm.tasks import SupportedTask
+from vllm.tracing import init_tracer
+from vllm.transformers_utils.tokenizer import (AnyTokenizer,
+                                               init_tokenizer_from_configs)
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils import Device
+from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.engine.output_processor import OutputProcessor
+from vllm.v1.engine.parallel_sampling import ParentRequest
+from vllm.v1.engine.processor import Processor
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
+from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
+from vllm.v1.metrics.stats import IterationStats
+from vllm.v1.worker.worker_base import WorkerBase
+
+# Module-level logger, named after this module.
+logger = init_logger(__name__)
+
+# Return type of callables passed to collective_rpc / apply_model;
+# falls back to Any when not otherwise bound.
+_R = TypeVar("_R", default=Any)
+
+
+class LLMEngine:
+    """Legacy LLMEngine for backwards compatibility.
+
+    Synchronous front-end that ties together a Processor (inputs to
+    EngineCoreRequests), an EngineCoreClient (runs the requests) and an
+    OutputProcessor (EngineCoreOutputs to RequestOutputs).
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+        use_cached_outputs: bool = False,
+        multiprocess_mode: bool = False,
+    ) -> None:
+        """Build the processor, output processor and engine core client.
+
+        Args:
+            vllm_config: full engine configuration.
+            executor_class: executor type handed to the engine core client.
+            log_stats: whether to create a StatLoggerManager and record
+                stats on every step.
+            usage_context: accepted but not used by this constructor.
+            stat_loggers: must be None; custom loggers are not supported.
+            mm_registry: multimodal registry passed to the Processor.
+            use_cached_outputs: accepted but not used by this constructor.
+            multiprocess_mode: passed to the engine core client factory;
+                when False the in-process model executor is also exposed
+                as ``self.model_executor``.
+
+        Raises:
+            ValueError: if ``envs.VLLM_USE_V1`` is false.
+            NotImplementedError: if ``stat_loggers`` is not None.
+        """
+        if not envs.VLLM_USE_V1:
+            raise ValueError(
+                "Using V1 LLMEngine, but envs.VLLM_USE_V1=False. "
+                "This should not happen. As a workaround, try using "
+                "LLMEngine.from_vllm_config(...) or explicitly set "
+                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
+
+        if stat_loggers is not None:
+            raise NotImplementedError(
+                "Passing StatLoggers to LLMEngine in V1 is not yet supported. "
+                "Set VLLM_USE_V1=0 and file and issue on Github.")
+
+        self.vllm_config = vllm_config
+        self.observability_config = vllm_config.observability_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+
+        self.log_stats = log_stats
+
+        executor_backend = (
+            self.vllm_config.parallel_config.distributed_executor_backend)
+        parallel_config = vllm_config.parallel_config
+        self.external_launcher_dp = (parallel_config.data_parallel_size > 1 and
+                                     executor_backend == "external_launcher")
+        # important: init dp group before init the engine_core
+        # In the decoupled engine case this is handled in EngineCoreProc.
+        if not multiprocess_mode and parallel_config.data_parallel_size > 1 \
+            and not self.external_launcher_dp:
+            self.dp_group = parallel_config.stateless_init_dp_group()
+        else:
+            self.dp_group = None
+        # Set by has_unfinished_requests_dp(); consumed by step().
+        self.should_execute_dummy_batch = False
+
+        if self.model_config.skip_tokenizer_init:
+            self.tokenizer = None
+        else:
+            # Tokenizer (+ ensure liveness if running in another process).
+            self.tokenizer = init_tokenizer_from_configs(
+                model_config=vllm_config.model_config)
+
+        # Processor (convert Inputs --> EngineCoreRequests)
+        self.processor = Processor(vllm_config=vllm_config,
+                                   tokenizer=self.tokenizer,
+                                   mm_registry=mm_registry)
+
+        # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
+        self.output_processor = OutputProcessor(self.tokenizer,
+                                                log_stats=self.log_stats)
+        if self.observability_config.otlp_traces_endpoint is not None:
+            tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+            self.output_processor.tracer = tracer
+
+        # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
+        self.engine_core = EngineCoreClient.make_client(
+            multiprocess_mode=multiprocess_mode,
+            asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=self.log_stats,
+        )
+
+        self.logger_manager: Optional[StatLoggerManager] = None
+        if self.log_stats:
+            self.logger_manager = StatLoggerManager(
+                vllm_config=vllm_config,
+                custom_stat_loggers=stat_loggers,
+                enable_default_loggers=log_stats,
+            )
+            self.logger_manager.log_engine_initialized()
+
+        if not multiprocess_mode:
+            # for v0 compatibility
+            self.model_executor = self.engine_core.engine_core.model_executor  # type: ignore
+
+        if self.external_launcher_dp:
+            # If we use DP in external launcher mode, we reuse the
+            # existing DP group used for data communication.
+            self.dp_group = get_dp_group().cpu_group
+
+        # Don't keep the dummy data in memory
+        self.reset_mm_cache()
+
+    @classmethod
+    def from_vllm_config(
+        cls,
+        vllm_config: VllmConfig,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        disable_log_stats: bool = False,
+    ) -> "LLMEngine":
+        """Create an engine from an existing VllmConfig.
+
+        The executor class is derived from the config and multiprocess
+        mode follows ``envs.VLLM_ENABLE_V1_MULTIPROCESSING``.
+        """
+        return cls(vllm_config=vllm_config,
+                   executor_class=Executor.get_class(vllm_config),
+                   log_stats=(not disable_log_stats),
+                   usage_context=usage_context,
+                   stat_loggers=stat_loggers,
+                   multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        enable_multiprocessing: bool = False,
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments.
+
+        Multiprocessing is enabled if either ``enable_multiprocessing``
+        is True or ``envs.VLLM_ENABLE_V1_MULTIPROCESSING`` is set.
+        """
+
+        # Create the engine configs.
+        vllm_config = engine_args.create_engine_config(usage_context)
+        executor_class = Executor.get_class(vllm_config)
+
+        if envs.VLLM_ENABLE_V1_MULTIPROCESSING:
+            logger.debug("Enabling multiprocessing for LLMEngine.")
+            enable_multiprocessing = True
+
+        # Create the LLMEngine.
+        return cls(vllm_config=vllm_config,
+                   executor_class=executor_class,
+                   log_stats=not engine_args.disable_log_stats,
+                   usage_context=usage_context,
+                   stat_loggers=stat_loggers,
+                   multiprocess_mode=enable_multiprocessing)
+
+    def get_num_unfinished_requests(self) -> int:
+        """Return the output processor's count of unfinished requests."""
+        return self.output_processor.get_num_unfinished_requests()
+
+    def has_unfinished_requests(self) -> bool:
+        """Return whether there is still work to step through.
+
+        Without a DP group this is the local state or the engine core's
+        ``dp_engines_running()``; with one, the state is aggregated
+        through ``has_unfinished_requests_dp``.
+        """
+        has_unfinished = self.output_processor.has_unfinished_requests()
+        if self.dp_group is None:
+            return has_unfinished or self.engine_core.dp_engines_running()
+        return self.has_unfinished_requests_dp(has_unfinished)
+
+    def has_unfinished_requests_dp(self, has_unfinished: bool) -> bool:
+        """Aggregate the unfinished flag across the DP group.
+
+        If this engine has nothing left but the aggregate says work
+        remains, the next ``step()`` is flagged to run a dummy batch.
+        """
+        aggregated_has_unfinished = ParallelConfig.has_unfinished_dp(
+            self.dp_group, has_unfinished)
+        if not has_unfinished and aggregated_has_unfinished:
+            self.should_execute_dummy_batch = True
+        return aggregated_has_unfinished
+
+    @classmethod
+    def validate_outputs(cls, outputs, output_type):
+        """Return ``outputs`` unchanged; ``output_type`` is ignored."""
+        return outputs
+
+    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
+        """Return the tasks reported by the engine core."""
+        return self.engine_core.get_supported_tasks()
+
+    def abort_request(self, request_ids: list[str]) -> None:
+        """Remove request_ids from EngineCore and Detokenizer."""
+
+        # The output processor returns the ids to forward to the core.
+        request_ids = self.output_processor.abort_requests(request_ids)
+        self.engine_core.abort_requests(request_ids)
+
+    def add_request(
+        self,
+        request_id: str,
+        prompt: PromptType,
+        params: Union[SamplingParams, PoolingParams],
+        arrival_time: Optional[float] = None,
+        lora_request: Optional[LoRARequest] = None,
+        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        priority: int = 0,
+    ) -> None:
+        """Process a prompt and submit it to the engine core.
+
+        For sampling params with ``n > 1`` the request is fanned out
+        into ``n`` child requests under a shared ParentRequest.
+
+        Raises:
+            TypeError: if ``request_id`` is not a string.
+        """
+        # Validate the request_id type.
+        if not isinstance(request_id, str):
+            raise TypeError(
+                f"request_id must be a string, got {type(request_id)}")
+
+        # Process raw inputs into the request.
+        prompt_str, request = self.processor.process_inputs(
+            request_id, prompt, params, arrival_time, lora_request,
+            tokenization_kwargs, trace_headers, priority)
+
+        n = params.n if isinstance(params, SamplingParams) else 1
+
+        if n == 1:
+            # Make a new RequestState and queue.
+            self.output_processor.add_request(request, prompt_str, None, 0)
+            # Add the request to EngineCore.
+            self.engine_core.add_request(request)
+            return
+
+        # Fan out child requests (for n>1).
+        parent_req = ParentRequest(request_id, params)
+        for idx in range(n):
+            request_id, params = parent_req.get_child_info(idx)
+            # The last child reuses the original request object; the
+            # others are shallow copies of it.
+            child_request = request if idx == n - 1 else copy(request)
+            child_request.request_id = request_id
+            child_request.sampling_params = params
+
+            # Make a new RequestState and queue.
+            self.output_processor.add_request(child_request, prompt_str,
+                                              parent_req, idx)
+            # Add the request to EngineCore.
+            self.engine_core.add_request(child_request)
+
+    def step(self) -> Union[list[RequestOutput], list[PoolingRequestOutput]]:
+        """Run one engine iteration and return the resulting outputs.
+
+        When a dummy batch has been flagged, it is executed instead and
+        an empty list is returned.
+        """
+
+        if self.should_execute_dummy_batch:
+            self.should_execute_dummy_batch = False
+            self.engine_core.execute_dummy_batch()
+            return []
+
+        # 1) Get EngineCoreOutput from the EngineCore.
+        outputs = self.engine_core.get_output()
+
+        # 2) Process EngineCoreOutputs.
+        iteration_stats = IterationStats() if self.log_stats else None
+        processed_outputs = self.output_processor.process_outputs(
+            outputs.outputs,
+            engine_core_timestamp=outputs.timestamp,
+            iteration_stats=iteration_stats)
+
+        # 3) Abort any reqs that finished due to stop strings.
+        self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
+
+        # 4) Record stats
+        if self.logger_manager is not None:
+            assert outputs.scheduler_stats is not None
+            self.logger_manager.record(
+                scheduler_stats=outputs.scheduler_stats,
+                iteration_stats=iteration_stats,
+            )
+            self.do_log_stats_with_interval()
+
+        return processed_outputs.request_outputs
+
+    def get_vllm_config(self):
+        """Return the VllmConfig this engine was built with."""
+        return self.vllm_config
+
+    def get_model_config(self):
+        """Return the model config taken from the VllmConfig."""
+        return self.model_config
+
+    def start_profile(self):
+        """Ask the engine core to start profiling."""
+        self.engine_core.profile(True)
+
+    def stop_profile(self):
+        """Ask the engine core to stop profiling."""
+        self.engine_core.profile(False)
+
+    def reset_mm_cache(self):
+        """Clear the processor cache and the engine core's mm cache."""
+        self.processor.clear_cache()
+        self.engine_core.reset_mm_cache()
+
+    def reset_prefix_cache(self, device: Optional[Device] = None):
+        """Reset the engine core's prefix cache.
+
+        ``device`` is accepted but not forwarded.
+        """
+        self.engine_core.reset_prefix_cache()
+
+    def sleep(self, level: int = 1):
+        """Forward a sleep request at the given level to the core."""
+        self.engine_core.sleep(level)
+
+    def wake_up(self, tags: Optional[list[str]] = None):
+        """Forward a wake-up request (optionally tagged) to the core."""
+        self.engine_core.wake_up(tags)
+
+    def is_sleeping(self) -> bool:
+        """Return the engine core's sleeping state."""
+        return self.engine_core.is_sleeping()
+
+    def get_metrics(self) -> list[Metric]:
+        """Return a snapshot of current metrics.
+
+        Asserts that stat logging is enabled.
+        """
+        assert self.log_stats, "Stat logging disabled"
+        return get_metrics_snapshot()
+
+    def get_tokenizer(self) -> AnyTokenizer:
+        """Return the tokenizer.
+
+        Raises:
+            ValueError: if the engine was built with
+                ``skip_tokenizer_init``.
+        """
+        if self.tokenizer is None:
+            raise ValueError("Unable to get tokenizer because "
+                             "skip_tokenizer_init is True")
+
+        return self.tokenizer
+
+    def do_log_stats(self) -> None:
+        """Log stats if logging is enabled."""
+        if self.logger_manager:
+            self.logger_manager.log()
+
+    def do_log_stats_with_interval(self) -> None:
+        """Log stats when the time interval has passed."""
+        now = time.time()
+        # Lazily start the interval clock on the first call.
+        if not hasattr(self, "_last_log_time"):
+            self._last_log_time = now
+        if now - self._last_log_time >= envs.VLLM_LOG_STATS_INTERVAL:
+            self.do_log_stats()
+            self._last_log_time = now
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        """Load a new LoRA adapter into the engine for future requests."""
+        return self.engine_core.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        """Remove an already loaded LoRA adapter."""
+        return self.engine_core.remove_lora(lora_id)
+
+    def list_loras(self) -> set[int]:
+        """List all registered adapters."""
+        return self.engine_core.list_loras()
+
+    def pin_lora(self, lora_id: int) -> bool:
+        """Prevent an adapter from being evicted."""
+        return self.engine_core.pin_lora(lora_id)
+
+    def collective_rpc(self,
+                       method: Union[str, Callable[[WorkerBase], _R]],
+                       timeout: Optional[float] = None,
+                       args: tuple = (),
+                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+        """Forward an RPC to the engine core and return its results."""
+        return self.engine_core.collective_rpc(method, timeout, args, kwargs)
+
+    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
+        """Run ``func`` via the "apply_model" collective RPC."""
+        return self.collective_rpc("apply_model", args=(func, ))
+
+    def __del__(self):
+        """Destroy the DP group this engine created, if any.
+
+        A group reused from the external launcher is not destroyed.
+        """
+        # Look the group up on its own line: a walrus in the condition
+        # would capture the whole `and` expression (a bool), not the
+        # group. getattr guards against a partially run __init__.
+        dp_group = getattr(self, "dp_group", None)
+        if dp_group is not None and not getattr(
+                self, "external_launcher_dp", False):
+            stateless_destroy_torch_distributed_process_group(dp_group)
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -0,0 +1,201 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Optional
+
+from vllm.logger import init_logger
+from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
+from vllm.transformers_utils.detokenizer_utils import (
+    AnyTokenizer, convert_ids_list_to_tokens)
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.outputs import LogprobsLists, LogprobsTensors
+
+# Module-level logger, named after this module.
+logger = init_logger(__name__)
+
+# Endless iterator of None, used in place of decoded token strings
+# when no tokenizer is available.
+NONES = itertools.repeat(None)
+
+
+@dataclass
+class LogprobsProcessor:
+    """Accumulates sample and prompt logprobs for a single request."""
+
+    # Tokenizer for this request; None when detokenization is disabled.
+    tokenizer: Optional[AnyTokenizer]
+
+    # Accumulated logprob state. Each list is None when that kind of
+    # logprob was not requested.
+    logprobs: Optional[SampleLogprobs]
+    prompt_logprobs: Optional[PromptLogprobs]
+    cumulative_logprob: Optional[float]
+    num_logprobs: Optional[int]
+    num_prompt_logprobs: Optional[int]
+
+    @classmethod
+    def from_new_request(
+        cls,
+        tokenizer: Optional[AnyTokenizer],
+        request: EngineCoreRequest,
+    ) -> "LogprobsProcessor":
+        """Create the initial processor state for a new request.
+
+        The request's sampling params must not be None.
+        """
+        sampling_params = request.sampling_params
+        assert sampling_params is not None
+
+        sample_count = sampling_params.logprobs
+        prompt_count = sampling_params.prompt_logprobs
+        wants_sample = sample_count is not None
+        wants_prompt = prompt_count is not None
+
+        return cls(
+            tokenizer=tokenizer,
+            logprobs=[] if wants_sample else None,
+            # NOTE: the first prompt token has no logprob, hence the
+            # leading None entry.
+            prompt_logprobs=[None] if wants_prompt else None,
+            cumulative_logprob=0. if wants_sample else None,
+            num_logprobs=sample_count,
+            num_prompt_logprobs=prompt_count,
+        )
+
+    def _update_sample_logprobs(self, logprobs_lists: LogprobsLists) -> None:
+        """Fold in the sample logprobs received from EngineCore.
+
+        The outer lists hold more than one entry only when EngineCore
+        produced several tokens in the previous step (e.g. spec decoding).
+
+        Args:
+          logprobs_lists: the lists of logprob tokens, logprobs, and ranks.
+
+        """
+
+        assert self.num_logprobs is not None
+        assert self.logprobs is not None
+        assert self.cumulative_logprob is not None
+
+        token_ids_lst, logprobs_lst, ranks_lst = logprobs_lists
+
+        for pos_token_ids, pos_logprobs, pos_rank in zip(
+                token_ids_lst, logprobs_lst, ranks_lst):
+
+            # Non-incremental detokenization of this position's tokens.
+            if self.tokenizer is None:
+                pos_tokens = NONES
+            else:
+                pos_tokens = convert_ids_list_to_tokens(
+                    self.tokenizer, pos_token_ids)
+
+            # The sampler places the sampled token's logprob first.
+            self.cumulative_logprob += pos_logprobs[0]
+
+            # Record the Logprob dictionary for this position.
+            entry = self._make_logprob_dict(
+                pos_logprobs,
+                pos_token_ids,
+                pos_tokens,
+                pos_rank,
+                self.num_logprobs,
+            )
+            self.logprobs.append(entry)
+
+    def _update_prompt_logprobs(
+        self,
+        prompt_logprobs_tensors: LogprobsTensors,
+    ) -> None:
+        """Fold in the prompt logprobs received from EngineCore.
+
+        Args:
+          prompt_logprobs_tensors: tuple containing the prompt logprobs
+                                   tensors.
+
+        """
+
+        # Only called when prompt logprobs are enabled.
+        assert self.num_prompt_logprobs is not None
+        assert self.prompt_logprobs is not None
+
+        ids_tensor, logprobs_tensor, ranks_tensor = prompt_logprobs_tensors
+
+        # Non-incremental detokenization over the flattened ids:
+        # [num_tok, num_lps] -> [num_tok * num_lps]
+        flat_tokens = None
+        if self.tokenizer is not None:
+            flat_tokens = convert_ids_list_to_tokens(
+                self.tokenizer,
+                ids_tensor.flatten().tolist())
+
+        # Shape is needed to undo the flattening below.
+        num_positions, width = logprobs_tensor.shape
+
+        # Convert the torch tensors to plain Python lists.
+        rank_rows = ranks_tensor.tolist()
+        logprob_rows = logprobs_tensor.tolist()
+        id_rows = ids_tensor.tolist()
+
+        # Build one Logprob dictionary per prompt position.
+        for pos in range(num_positions):
+            if flat_tokens is None:
+                tokens_for_pos = NONES
+            else:
+                # Slice this position's row out of the flat token list.
+                start = pos * width
+                tokens_for_pos = flat_tokens[start:start + width]
+
+            entry = self._make_logprob_dict(logprob_rows[pos], id_rows[pos],
+                                            tokens_for_pos, rank_rows[pos],
+                                            self.num_prompt_logprobs)
+            self.prompt_logprobs.append(entry)
+
+    def pop_prompt_logprobs(self) -> Optional[PromptLogprobs]:
+        """Return all accumulated prompt logprobs and forget them.
+
+        Prompt logprobs are gathered over one or more prefill chunks.
+        Handing them back in one go and then clearing them gives the
+        correct RequestOutputKind.DELTA semantics, where all prompt
+        logprobs are delivered together at the end of prefill.
+
+        Returns:
+          None if prompt logprobs are disabled for this request.
+          List of all prompt logprobs, otherwise.
+        """
+        collected = self.prompt_logprobs
+        if not collected:
+            # Disabled (None) or already empty: nothing to reset.
+            return collected
+        self.prompt_logprobs = []
+        return collected
+
+    @staticmethod
+    def _make_logprob_dict(
+        logprobs: list[float],
+        logprob_token_ids: list[int],
+        decoded_tokens: Iterable[Optional[str]],
+        rank: int,
+        num_logprobs: int,
+    ) -> dict[int, Logprob]:
+        """Build the Logprob dictionary for one position.
+
+        Args:
+          logprobs: list of log probabilities
+          logprob_token_ids: list of top token ids
+          decoded_tokens: list of decoded top tokens
+          rank: rank of the sampled token
+          num_logprobs: number of logprobs requested
+            by the user (in addition to sampled logprob);
+            -1 means one per entry in ``logprobs``
+
+        Returns:
+          dict[token id, Logprob]
+        """
+        top_k = len(logprobs) if num_logprobs == -1 else num_logprobs
+
+        # First entry is the sampled token with its own rank, followed by
+        # the top-k entries ranked 1..k. A sampled token that is also in
+        # the top-k needs no special case: the duplicate key just
+        # overwrites the earlier entry.
+        all_ranks = itertools.chain([rank], range(1, top_k + 1))
+
+        result: dict[int, Logprob] = {}
+        for tok_id, tok_logprob, tok_rank, tok_text in zip(
+                logprob_token_ids, logprobs, all_ranks, decoded_tokens):
+            result[tok_id] = Logprob(
+                logprob=tok_logprob,
+                rank=tok_rank,
+                decoded_token=tok_text,
+            )
+        return result
+
+    def update_from_output(self, output: EngineCoreOutput) -> None:
+        """Apply whichever logprob payloads are present on ``output``."""
+        sample_payload = output.new_logprobs
+        if sample_payload is not None:
+            self._update_sample_logprobs(sample_payload)
+
+        prompt_payload = output.new_prompt_logprobs_tensors
+        if prompt_payload is not None:
+            self._update_prompt_logprobs(prompt_payload)
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -0,0 +1,576 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Any, Optional, Union, cast
+
+import torch
+
+from vllm.outputs import (CompletionOutput, PoolingOutput,
+                          PoolingRequestOutput, RequestOutput)
+from vllm.sampling_params import RequestOutputKind
+from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
+                          extract_trace_context)
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import length_from_prompt_token_ids_or_embeds
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
+from vllm.v1.engine.detokenizer import IncrementalDetokenizer
+from vllm.v1.engine.logprobs import LogprobsProcessor
+from vllm.v1.engine.parallel_sampling import ParentRequest
+from vllm.v1.metrics.stats import (IterationStats, LoRARequestStates,
+                                   RequestStateStats)
+
+
+class RequestOutputCollector:
+    """
+    Collects streamed RequestOutputs per individual request,
+    for hand-off to the consuming asyncio generate task.
+
+    When streaming deltas, RequestOutputs are merged if the
+    producer gets ahead of the consumer.
+    """
+
+    def __init__(self, output_kind: RequestOutputKind):
+        self.aggregate = output_kind == RequestOutputKind.DELTA
+        self.output: Optional[Union[RequestOutput, PoolingRequestOutput,
+                                    Exception]] = None
+        self.ready = asyncio.Event()
+
+    def put(self, output: Union[RequestOutput, PoolingRequestOutput,
+                                Exception]) -> None:
+        """Non-blocking put operation."""
+        if self.output is None or isinstance(output, Exception):
+            self.output = output
+            self.ready.set()
+        elif isinstance(self.output, (RequestOutput, PoolingRequestOutput)):
+            # This ensures that request outputs with different request indexes
+            # (if n > 1) do not override each other.
+            self.output.add(output, aggregate=self.aggregate)
+
+    async def get(self) -> Union[RequestOutput, PoolingRequestOutput]:
+        """Get operation blocks on put event."""
+        while (output := self.output) is None:
+            await self.ready.wait()
+        self.output = None
+        self.ready.clear()
+        if isinstance(output, Exception):
+            raise output
+        return output
+
+    def get_nowait(
+            self) -> Optional[Union[RequestOutput, PoolingRequestOutput]]:
+        """Non-blocking get operation."""
+        output = self.output
+        if output is not None:
+            self.output = None
+            self.ready.clear()
+        if isinstance(output, Exception):
+            raise output
+        return output
+
+
+@dataclass
+class OutputProcessorOutput:
+    """Result of one `OutputProcessor.process_outputs` call."""
+    # Outputs that were not handed to a per-request queue.
+    request_outputs: list[Union[RequestOutput, PoolingRequestOutput]]
+    # IDs of requests that finished during processing (stop string
+    # detected) but are not yet finished in EngineCore.
+    reqs_to_abort: list[str]
+
+
+class RequestState:
+
+    def __init__(
+        self,
+        request_id: str,
+        parent_req: Optional[ParentRequest],
+        request_index: int,
+        lora_name: Optional[str],
+        output_kind: RequestOutputKind,
+        prompt: Optional[str],
+        prompt_token_ids: Optional[list[int]],
+        prompt_embeds: Optional[torch.Tensor],
+        logprobs_processor: Optional[LogprobsProcessor],
+        detokenizer: Optional[IncrementalDetokenizer],
+        max_tokens_param: Optional[int],
+        arrival_time: float,
+        queue: Optional[RequestOutputCollector],
+        log_stats: bool,
+        top_p: Optional[float] = None,
+        n: Optional[int] = None,
+        temperature: Optional[float] = None,
+    ):
+        self.request_id = request_id
+        self.parent_req = parent_req
+        self.request_index = request_index
+        self.lora_name = lora_name
+        self.output_kind = output_kind
+        self.prompt = prompt
+        self.prompt_token_ids = prompt_token_ids
+        self.prompt_embeds = prompt_embeds
+        self.prompt_len = length_from_prompt_token_ids_or_embeds(
+            self.prompt_token_ids, self.prompt_embeds)
+        self.logprobs_processor = logprobs_processor
+        self.detokenizer = detokenizer
+        self.max_tokens_param = max_tokens_param
+        self.top_p = top_p
+        self.n = n
+        self.temperature = temperature
+        self.is_prefilling = True
+        self.queue = queue
+        self.num_cached_tokens = 0
+
+        self.stats = RequestStateStats(
+            arrival_time=arrival_time) if log_stats else None
+
+    @classmethod
+    def from_new_request(
+        cls,
+        tokenizer: AnyTokenizer,
+        request: EngineCoreRequest,
+        prompt: Optional[str],
+        parent_req: Optional[ParentRequest],
+        request_index: int,
+        queue: Optional[RequestOutputCollector],
+        log_stats: bool,
+    ) -> "RequestState":
+
+        if sampling_params := request.sampling_params:
+            if not sampling_params.detokenize:
+                tokenizer = None
+            output_kind = sampling_params.output_kind
+            logprobs_processor = LogprobsProcessor.from_new_request(
+                tokenizer=tokenizer,
+                request=request,
+            )
+            detokenizer = IncrementalDetokenizer.from_new_request(
+                tokenizer=tokenizer,
+                request=request,
+            )
+            max_tokens_param = sampling_params.max_tokens
+            top_p = sampling_params.top_p
+            n = sampling_params.n
+            temperature = sampling_params.temperature
+        else:
+            logprobs_processor = None
+            detokenizer = None
+            max_tokens_param = None
+            top_p = None
+            n = None
+            temperature = None
+            assert request.pooling_params is not None
+            output_kind = request.pooling_params.output_kind
+
+        return cls(
+            request_id=request.request_id,
+            parent_req=parent_req,
+            request_index=request_index,
+            lora_name=(request.lora_request.name
+                       if request.lora_request is not None else None),
+            output_kind=output_kind,
+            prompt=prompt,
+            prompt_token_ids=request.prompt_token_ids,
+            prompt_embeds=request.prompt_embeds,
+            logprobs_processor=logprobs_processor,
+            detokenizer=detokenizer,
+            max_tokens_param=max_tokens_param,
+            top_p=top_p,
+            n=n,
+            temperature=temperature,
+            arrival_time=request.arrival_time,
+            queue=queue,
+            log_stats=log_stats,
+        )
+
+    def make_request_output(
+        self,
+        new_token_ids: list[int],
+        pooling_output: Optional[torch.Tensor],
+        finish_reason: Optional[FinishReason],
+        stop_reason: Union[int, str, None],
+        kv_transfer_params: Optional[dict[str, Any]] = None,
+    ) -> Optional[Union[RequestOutput, PoolingRequestOutput]]:
+
+        finished = finish_reason is not None
+        final_only = self.output_kind == RequestOutputKind.FINAL_ONLY
+
+        if not finished and final_only:
+            # Only the final output is required in FINAL_ONLY mode.
+            return None
+
+        request_id = self.request_id
+        if pooling_output is not None:
+            return self._new_request_output(
+                request_id, [self._new_pooling_output(pooling_output)],
+                finished)
+
+        output = self._new_completion_output(new_token_ids, finish_reason,
+                                             stop_reason)
+
+        if self.parent_req is None:
+            outputs = [output]
+        else:
+            request_id, outputs, finished = self.parent_req.get_outputs(
+                request_id, output)
+            if not outputs:
+                return None
+
+        return self._new_request_output(request_id, outputs, finished,
+                                        kv_transfer_params)
+
+    def _new_request_output(
+        self,
+        request_id: str,
+        outputs: Union[list[CompletionOutput], list[PoolingOutput]],
+        finished: bool,
+        kv_transfer_params: Optional[dict[str, Any]] = None,
+    ) -> Union[RequestOutput, PoolingRequestOutput]:
+
+        first_output = outputs[0]
+        if isinstance(first_output, PoolingOutput):
+            assert len(outputs) == 1
+            # Prompt embeddings are currently not supported by pooling requests.
+            assert self.prompt_token_ids is not None
+            return PoolingRequestOutput(
+                request_id=request_id,
+                outputs=first_output,
+                prompt_token_ids=self.prompt_token_ids,
+                finished=finished,
+            )
+        assert self.logprobs_processor is not None
+        if self.output_kind == RequestOutputKind.DELTA:
+            # Side effect: logprobs processor forgets prompt logprobs
+            prompt_logprobs = self.logprobs_processor.pop_prompt_logprobs()
+        else:
+            prompt_logprobs = self.logprobs_processor.prompt_logprobs
+
+        # If prompt embeds were used, put placeholder prompt token ids
+        prompt_token_ids = self.prompt_token_ids
+        if prompt_token_ids is None and self.prompt_embeds is not None:
+            prompt_token_ids = [0] * len(self.prompt_embeds)
+
+        return RequestOutput(
+            request_id=request_id,
+            prompt=self.prompt,
+            prompt_token_ids=prompt_token_ids,
+            prompt_logprobs=prompt_logprobs,
+            outputs=cast(list[CompletionOutput], outputs),
+            finished=finished,
+            kv_transfer_params=kv_transfer_params,
+            num_cached_tokens=self.num_cached_tokens,
+        )
+
+    def _new_completion_output(
+        self,
+        token_ids: list[int],
+        finish_reason: Optional[FinishReason],
+        stop_reason: Union[int, str, None],
+    ) -> CompletionOutput:
+
+        assert self.detokenizer is not None
+        assert self.logprobs_processor is not None
+        finished = finish_reason is not None
+        delta = self.output_kind == RequestOutputKind.DELTA
+
+        # Prepare text and token_ids, based on delta mode
+        text = self.detokenizer.get_next_output_text(finished, delta)
+        if not delta:
+            token_ids = self.detokenizer.output_token_ids
+
+        # Prepare logprobs, based on delta mode
+        logprobs = self.logprobs_processor.logprobs
+        if delta and logprobs:
+            logprobs = logprobs[-len(token_ids):]
+
+        return CompletionOutput(
+            index=self.request_index,
+            text=text,
+            token_ids=token_ids,
+            logprobs=logprobs,
+            cumulative_logprob=self.logprobs_processor.cumulative_logprob,
+            finish_reason=str(finish_reason) if finished else None,
+            stop_reason=stop_reason if finished else None)
+
+    def _new_pooling_output(
+        self,
+        pooling_output: torch.Tensor,
+    ) -> PoolingOutput:
+        """Wrap a pooling tensor in a `PoolingOutput`."""
+
+        return PoolingOutput(data=pooling_output)
+
+
+class OutputProcessor:
+    """Process EngineCoreOutputs into RequestOutputs."""
+
+    def __init__(self, tokenizer: AnyTokenizer, log_stats: bool):
+        # Forwarded to every new RequestState; controls whether it keeps
+        # a per-request stats record.
+        self.log_stats = log_stats
+        # Forwarded to every new RequestState.
+        self.tokenizer = tokenizer
+        # Tracked (unfinished) requests, keyed by request ID.
+        self.request_states: dict[str, RequestState] = {}
+        # Parent requests of tracked child requests, keyed by parent ID.
+        self.parent_requests: dict[str, ParentRequest] = {}
+        self.lora_states = LoRARequestStates()
+        # Optional tracer; spans are only emitted when this is set.
+        self.tracer: Optional[Tracer] = None
+
+    def get_num_unfinished_requests(self) -> int:
+        """Return the number of requests currently being tracked."""
+        return len(self.request_states)
+
+    def has_unfinished_requests(self) -> bool:
+        return len(self.request_states) > 0
+
+    def propagate_error(self, e: Exception):
+        """Propagate error to all generate() tasks."""
+
+        for _, state in self.request_states.items():
+            assert state.queue is not None
+            state.queue.put(e)
+
+    def abort_requests(
+        self,
+        request_ids: Iterable[str],
+    ) -> list[str]:
+        request_ids_to_abort = []
+        for request_id in request_ids:
+            req_state = self.request_states.pop(request_id, None)
+            if req_state is not None:
+                self.lora_states.abort_request(req_state)
+                request_ids_to_abort.append(request_id)
+                # Produce final abort output.
+                if req_state.queue is not None and (
+                        request_output := req_state.make_request_output(
+                            new_token_ids=[],
+                            # Set pooling_output is not None to
+                            # correctly enter the abort pooling branch
+                            pooling_output=torch.randn(0, device="cpu")
+                            if req_state.detokenizer is None else None,
+                            finish_reason=FinishReason.ABORT,
+                            stop_reason=None,
+                            kv_transfer_params=None)):
+                    req_state.queue.put(request_output)
+            elif parent := self.parent_requests.get(request_id):
+                # Abort children prior to removing the parent.
+                if parent.child_requests:
+                    child_reqs = list(parent.child_requests)
+                    child_reqs = self.abort_requests(child_reqs)
+                    request_ids_to_abort.extend(child_reqs)
+                self.parent_requests.pop(request_id, None)
+        return request_ids_to_abort
+
+    def add_request(
+        self,
+        request: EngineCoreRequest,
+        prompt: Optional[str],
+        parent_req: Optional[ParentRequest] = None,
+        request_index: int = 0,
+        queue: Optional[RequestOutputCollector] = None,
+    ) -> None:
+        request_id = request.request_id
+        if request_id in self.request_states:
+            raise ValueError(f"Request id {request_id} already running.")
+
+        req_state = RequestState.from_new_request(tokenizer=self.tokenizer,
+                                                  request=request,
+                                                  prompt=prompt,
+                                                  parent_req=parent_req,
+                                                  request_index=request_index,
+                                                  queue=queue,
+                                                  log_stats=self.log_stats)
+        self.request_states[request_id] = req_state
+        self.lora_states.add_request(req_state)
+        if parent_req:
+            self.parent_requests[parent_req.request_id] = parent_req
+
+    def process_outputs(
+        self,
+        engine_core_outputs: list[EngineCoreOutput],
+        engine_core_timestamp: Optional[float] = None,
+        iteration_stats: Optional[IterationStats] = None,
+    ) -> OutputProcessorOutput:
+        """
+        Process the EngineCoreOutputs:
+        1) Compute stats for logging
+        2) Detokenize
+        3) Create and handle RequestOutput objects:
+            * If there is a queue (for usage with AsyncLLM),
+              put the RequestOutput objects into the queue for
+              handling by the per-request generate() tasks.
+
+            * If there is no queue (for usage with LLMEngine),
+              return a list of RequestOutput objects.
+
+        NOTE FOR DEVELOPERS
+
+        vLLM V1 minimizes the number of python loops over the full
+        batch to ensure system overheads are minimized. This is the
+        only function that should loop over EngineCoreOutputs.
+
+        If you need to touch every element of the batch, do it from
+        within the loop below.
+
+        Args:
+          engine_core_outputs: outputs of one engine step
+          engine_core_timestamp: timestamp passed on to the stats
+            update; must be set whenever iteration_stats is set
+          iteration_stats: stats object to update, or None to skip
+            stats collection
+
+        Returns:
+          The outputs that were not queued, plus the IDs of requests
+          that finished here but are not yet finished in EngineCore.
+        """
+
+        request_outputs: Union[list[RequestOutput],
+                               list[PoolingRequestOutput]] = []
+        reqs_to_abort: list[str] = []
+        for engine_core_output in engine_core_outputs:
+            req_id = engine_core_output.request_id
+            req_state = self.request_states.get(req_id)
+            if req_state is None:
+                # Ignore output for already-aborted request.
+                continue
+
+            # 1) Compute stats for this iteration.
+            # This runs before is_prefilling is cleared below, so the
+            # stats update still sees the pre-output prefill flag.
+            self._update_stats_from_output(req_state, engine_core_output,
+                                           engine_core_timestamp,
+                                           iteration_stats)
+
+            new_token_ids = engine_core_output.new_token_ids
+            pooling_output = engine_core_output.pooling_output
+            finish_reason = engine_core_output.finish_reason
+            stop_reason = engine_core_output.stop_reason
+            kv_transfer_params = engine_core_output.kv_transfer_params
+            req_state.num_cached_tokens = engine_core_output.num_cached_tokens
+            req_state.is_prefilling = False
+
+            if pooling_output is None:
+                assert req_state.detokenizer is not None
+                assert req_state.logprobs_processor is not None
+                # 2) Detokenize the token ids into text and perform stop checks.
+                stop_string = req_state.detokenizer.update(
+                    new_token_ids, finish_reason == FinishReason.STOP)
+                if stop_string:
+                    # A stop string found by the detokenizer overrides
+                    # whatever finish/stop reason EngineCore reported.
+                    finish_reason = FinishReason.STOP
+                    stop_reason = stop_string
+
+                # 3) Compute sample and prompt logprobs for request,
+                # if required.
+                req_state.logprobs_processor.update_from_output(
+                    engine_core_output)
+
+            # 4) Create and handle RequestOutput objects.
+            if request_output := req_state.make_request_output(
+                    new_token_ids, pooling_output, finish_reason, stop_reason,
+                    kv_transfer_params):
+                if req_state.queue is not None:
+                    # AsyncLLM: put into queue for handling by generate().
+                    req_state.queue.put(request_output)
+                else:
+                    # LLMEngine: return list of RequestOutputs.
+                    request_outputs.append(request_output)
+
+            # Free completed requests.
+            if finish_reason is not None:
+                self.request_states.pop(req_id)
+                # Remove parent request if applicable.
+                parent_req = req_state.parent_req
+                if parent_req and not parent_req.child_requests:
+                    self.parent_requests.pop(parent_req.request_id, None)
+                if not engine_core_output.finished:
+                    # If req not finished in EngineCore, but Detokenizer
+                    # detected stop string, abort needed in EngineCore.
+                    reqs_to_abort.append(req_id)
+
+                # Track per-request stats
+                self._update_stats_from_finished(req_state, finish_reason,
+                                                 iteration_stats)
+                if self.tracer:
+                    self.do_tracing(engine_core_output, req_state,
+                                    iteration_stats)
+        # Once per call, after the whole batch has been processed.
+        self.lora_states.update_iteration_stats(iteration_stats)
+
+        return OutputProcessorOutput(
+            request_outputs=request_outputs,
+            reqs_to_abort=reqs_to_abort,
+        )
+
+    def do_tracing(self, engine_core_output: EngineCoreOutput,
+                   req_state: RequestState,
+                   iteration_stats: Optional[IterationStats]) -> None:
+        assert req_state.stats is not None
+        assert iteration_stats is not None
+        assert self.tracer is not None
+
+        arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
+        trace_context = extract_trace_context(engine_core_output.trace_headers)
+        prompt_length = length_from_prompt_token_ids_or_embeds(
+            req_state.prompt_token_ids, req_state.prompt_embeds)
+        with (self.tracer.start_as_current_span(
+                "llm_request",
+                kind=SpanKind.SERVER,
+                context=trace_context,
+                start_time=arrival_time_nano_seconds) as span):
+            metrics = req_state.stats
+            e2e_time = iteration_stats.iteration_timestamp - \
+                       metrics.arrival_time
+            queued_time = metrics.scheduled_ts - metrics.queued_ts
+            prefill_time = metrics.first_token_ts - metrics.scheduled_ts
+            decode_time = metrics.last_token_ts - metrics.first_token_ts
+            inference_time = metrics.last_token_ts - metrics.scheduled_ts
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
+                metrics.first_token_latency)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+                               queued_time)
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
+                               prompt_length)
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
+                               metrics.num_generation_tokens)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL,
+                prefill_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE,
+                decode_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE,
+                inference_time)
+
+            # meta
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
+                               req_state.request_id)
+            if req_state.top_p:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
+                                   req_state.top_p)
+            if req_state.max_tokens_param:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
+                                   req_state.max_tokens_param)
+            if req_state.temperature:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
+                                   req_state.temperature)
+            if req_state.n:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
+                                   req_state.n)
+
+    def _update_stats_from_output(self, req_state: RequestState,
+                                  engine_core_output: EngineCoreOutput,
+                                  engine_core_timestamp: Optional[float],
+                                  iteration_stats: Optional[IterationStats]):
+        if iteration_stats is None:
+            return
+
+        lora_stats = self.lora_states.get_stats(req_state)
+
+        assert engine_core_timestamp is not None
+        assert req_state.stats is not None
+        iteration_stats.update_from_output(engine_core_output,
+                                           engine_core_timestamp,
+                                           req_state.is_prefilling,
+                                           req_state.prompt_len,
+                                           req_state.stats, lora_stats)
+
+    def _update_stats_from_finished(self, req_state: RequestState,
+                                    finish_reason: Optional[FinishReason],
+                                    iteration_stats: Optional[IterationStats]):
+        if iteration_stats is None:
+            return
+
+        assert finish_reason is not None
+        assert req_state.stats is not None
+        iteration_stats.update_from_finished_request(
+            finish_reason=finish_reason,
+            num_prompt_tokens=length_from_prompt_token_ids_or_embeds(
+                req_state.prompt_token_ids, req_state.prompt_embeds),
+            max_tokens_param=req_state.max_tokens_param,
+            req_stats=req_state.stats)
+        self.lora_states.finish_request(req_state)
+
+        ParentRequest.observe_finished_request(
+            req_state.parent_req, iteration_stats,
+            req_state.stats.num_generation_tokens)
--- a/vllm/v1/engine/parallel_sampling.py
+++ b/vllm/v1/engine/parallel_sampling.py
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from copy import copy
+from typing import Optional
+
+from vllm.outputs import CompletionOutput
+from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.v1.metrics.stats import IterationStats
+
+
+class ParentRequest:
+    """Info, state & processing for parallel sampling request.
+
+    Store parent request ID and sampling params.
+    Facilitate generating child request sampling params.
+    """
+
+    request_id: str
+    sampling_params: SamplingParams
+
+    # To track the completion of child requests
+    child_requests: set[str]
+
+    # To aggregate child completions when not streaming
+    output_aggregator: list[CompletionOutput]
+
+    # To find the max number of generated tokens across all children
+    max_num_generation_tokens: int
+
+    # To efficiently obtain child sampling params
+    cached_child_sampling_params: Optional[SamplingParams]
+
+    def __init__(self, request_id: str,
+                 sampling_params: SamplingParams) -> None:
+        self.request_id = request_id
+        self.sampling_params = sampling_params
+
+        self.child_requests = set()
+        self.output_aggregator = [None] * sampling_params.n if (
+            sampling_params.output_kind
+            == RequestOutputKind.FINAL_ONLY) else []
+        self.max_num_generation_tokens = 0
+        self.cached_child_sampling_params = None
+
+    def _get_child_sampling_params(
+        self,
+        index: int,
+    ) -> SamplingParams:
+        """Efficiently obtain child `sampling_params`
+
+        If `sampling_params.seed` is not `None` then 
+        each child request requires a unique clone of
+        parent `sampling_params` with a unique seed.
+
+        Args:
+          index: index within `n` child requests
+
+        Returns:
+          Child `sampling_params` instance.
+        """
+        seed = self.sampling_params.seed
+        if self.cached_child_sampling_params:
+            # Reuse child sampling_params data structure
+            return self.cached_child_sampling_params
+        # Build child sampling_params
+        child_sampling_params = copy(self.sampling_params)
+        child_sampling_params.n = 1
+        if seed is None:
+            # Cache child sampling_params for later reuse
+            self.cached_child_sampling_params = child_sampling_params
+        else:
+            # Each child gets a clone with a unique seed
+            child_sampling_params.seed = seed + index
+        return child_sampling_params
+
+    def get_child_info(self, index: int) -> tuple[str, SamplingParams]:
+        """Get child request ID and sampling params.
+        
+        Args:
+          index: index within `n` child requests.
+        
+        Returns:
+          (request ID, sampling_params) tuple
+        """
+        child_req_id = f"{index}_{self.request_id}"
+        self.child_requests.add(child_req_id)
+        return child_req_id, self._get_child_sampling_params(index)
+
+    @property
+    def n(self) -> int:
+        return self.sampling_params.n
+
+    def get_outputs(
+        self,
+        child_request_id: str,
+        completion_output: CompletionOutput,
+    ) -> tuple[str, list[CompletionOutput], bool]:
+        if completion_output.finished():
+            self.child_requests.remove(child_request_id)
+
+        if self.sampling_params.output_kind != RequestOutputKind.FINAL_ONLY:
+            # If streaming, just return the current output.
+            outputs = [completion_output]
+        else:
+            # If not streaming, aggregate the n final outputs.
+            self.output_aggregator[completion_output.index] = completion_output
+            outputs = [] if self.child_requests else self.output_aggregator
+
+        finished = not self.child_requests
+        return self.request_id, outputs, finished
+
+    def observe_num_generation_tokens(self, num_generation_tokens: int):
+        self.max_num_generation_tokens = max(num_generation_tokens,
+                                             self.max_num_generation_tokens)
+        return self.max_num_generation_tokens
+
+    @staticmethod
+    def observe_finished_request(parent_req: Optional['ParentRequest'],
+                                 iteration_stats: IterationStats,
+                                 num_generation_tokens: int):
+
+        n_param = parent_req.n if parent_req is not None else 1
+
+        if parent_req is not None:
+            num_generation_tokens = parent_req.observe_num_generation_tokens(
+                num_generation_tokens)
+
+        # Child requests finished, we can now record to iteration stats
+        if parent_req is None or not parent_req.child_requests:
+            iteration_stats.max_num_generation_tokens_iter.append(
+                num_generation_tokens)
+            iteration_stats.n_params_iter.append(n_param)
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -0,0 +1,545 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import time
+from collections.abc import Mapping
+from typing import Any, Literal, Optional, Union
+
+from vllm.config import VllmConfig
+from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
+from vllm.inputs.parse import split_enc_dec_inputs
+from vllm.inputs.preprocess import InputPreprocessor
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.multimodal.cache import processor_cache_from_config
+from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
+from vllm.multimodal.processing import EncDecMultiModalProcessor
+from vllm.multimodal.utils import argsort_mm_positions
+from vllm.pooling_params import PoolingParams
+from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import length_from_prompt_token_ids_or_embeds
+from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.structured_output.backend_guidance import (
+    validate_guidance_grammar)
+from vllm.v1.structured_output.backend_lm_format_enforcer import (
+    validate_structured_output_request_lm_format_enforcer)
+from vllm.v1.structured_output.backend_outlines import (
+    validate_structured_output_request_outlines)
+from vllm.v1.structured_output.backend_xgrammar import (
+    validate_xgrammar_grammar)
+
+logger = init_logger(__name__)
+
+
+class Processor:
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        tokenizer: AnyTokenizer,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+    ):
+        """Store the relevant config sections and build the preprocessor.
+
+        Args:
+            vllm_config: Top-level config; its model, cache, LoRA and
+                structured-outputs sections are kept as attributes.
+            tokenizer: Tokenizer handed to the input preprocessor. Other
+                methods of this class treat `None` as "no tokenizer".
+            mm_registry: Multimodal registry used for the processor cache
+                and the input preprocessor.
+        """
+        model_config = vllm_config.model_config
+
+        # Config sections consulted by the validators.
+        self.vllm_config = vllm_config
+        self.model_config = model_config
+        self.cache_config = vllm_config.cache_config
+        self.lora_config = vllm_config.lora_config
+        self.structured_outputs_config = vllm_config.structured_outputs_config
+        self.tokenizer = tokenizer
+
+        self.generation_config_fields = (
+            model_config.try_get_generation_config())
+
+        # Multimodal plumbing shared with the input preprocessor.
+        self.mm_registry = mm_registry
+        mm_cache = processor_cache_from_config(vllm_config, mm_registry)
+        self.mm_processor_cache = mm_cache
+
+        self.input_preprocessor = InputPreprocessor(
+            model_config,
+            tokenizer,
+            mm_registry,
+            mm_processor_cache=mm_cache,
+        )
+
+    def _validate_logprobs(
+        self,
+        params: SamplingParams,
+    ) -> None:
+        """Reject logprob counts above the configured maximum.
+
+        A value of -1, for either the configured maximum or a requested
+        count, stands for the model's vocabulary size. Falsy requests
+        (`None` or 0) are not checked.
+
+        Raises:
+            ValueError: if the sample or prompt logprobs request exceeds
+                the maximum.
+        """
+        model_config = self.model_config
+
+        max_logprobs = model_config.max_logprobs
+        if max_logprobs == -1:
+            max_logprobs = model_config.get_vocab_size()
+
+        # Sample logprobs first.
+        num_logprobs = params.logprobs
+        if num_logprobs:
+            if num_logprobs == -1:
+                num_logprobs = model_config.get_vocab_size()
+            if num_logprobs > max_logprobs:
+                raise ValueError(
+                    f"Requested sample logprobs of {num_logprobs}, "
+                    f"which is greater than max allowed: {max_logprobs}")
+
+        # Then prompt logprobs.
+        num_prompt_logprobs = params.prompt_logprobs
+        if num_prompt_logprobs:
+            if num_prompt_logprobs == -1:
+                num_prompt_logprobs = model_config.get_vocab_size()
+            if num_prompt_logprobs > max_logprobs:
+                raise ValueError(
+                    f"Requested prompt logprobs of {num_prompt_logprobs}, "
+                    f"which is greater than max allowed: {max_logprobs}")
+
+    def _validate_sampling_params(
+        self,
+        params: SamplingParams,
+    ) -> None:
+        """Validate structured outputs, logit bias and allowed token ids.
+
+        `allowed_token_ids` may be `None` (no restriction). When given, it
+        must be non-empty and, if a tokenizer is available, every id must
+        lie in `[0, len(tokenizer))`.
+
+        Raises:
+            ValueError: on an empty or out-of-vocab `allowed_token_ids`, or
+                from the structured-output / logit-bias validators.
+        """
+        self._validate_structured_output(params)
+        self._validate_logit_bias(params)
+
+        allowed = params.allowed_token_ids
+        if allowed is None:
+            return
+        if not allowed:
+            raise ValueError("allowed_token_ids is not None and empty!")
+
+        tokenizer = self.tokenizer
+        if tokenizer is None:
+            # When skip_tokenizer_init=True, we can't validate token IDs
+            # Skip validation and let the model handle invalid tokens
+            return
+
+        vocab_size = len(tokenizer)
+        if any(not (0 <= tid < vocab_size) for tid in allowed):
+            raise ValueError(
+                "allowed_token_ids contains out-of-vocab token id!")
+
+    def _validate_logit_bias(
+        self,
+        params: SamplingParams,
+    ) -> None:
+        """Check that every `logit_bias` token id is inside the vocabulary.
+
+        The vocabulary size comes from the model config. A missing or empty
+        `logit_bias` is accepted.
+
+        Raises:
+            ValueError: listing all token ids outside `[0, vocab_size)`.
+        """
+        logit_bias = params.logit_bias
+        if not logit_bias:
+            return
+
+        vocab_size = self.model_config.get_vocab_size()
+        invalid_token_ids = [
+            token_id for token_id in logit_bias
+            if token_id < 0 or token_id >= vocab_size
+        ]
+        if not invalid_token_ids:
+            return
+
+        raise ValueError(
+            f"token_id(s) {invalid_token_ids} in logit_bias contain "
+            f"out-of-vocab token ids. Vocabulary size: {vocab_size}")
+
+    def _validate_supported_sampling_params(
+        self,
+        params: SamplingParams,
+    ) -> None:
+        """Reject sampling options that are rejected here for vLLM V1.
+
+        Raises:
+            ValueError: if `best_of` is greater than 1, or if per-request
+                logits processors are supplied.
+        """
+        best_of = params.best_of
+        wants_best_of = best_of is not None and best_of > 1
+        if wants_best_of:
+            # Best of not yet supported.
+            raise ValueError("vLLM V1 does not yet support best_of.")
+
+        if params.logits_processors:
+            # Logits processors not supported.
+            raise ValueError("vLLM V1 does not support per request "
+                             "user provided logits processors.")
+
+    def _validate_params(
+        self,
+        params: Union[SamplingParams, PoolingParams],
+    ):
+        """
+        Run the sampling-parameter validators on `params`.
+
+        Pooling parameters are accepted without any checks. For anything
+        else, the logprobs, general and supported-feature validators run in
+        that order; an unsupported setting should surface as a ValueError
+        for the API server.
+        """
+        if not isinstance(params, PoolingParams):
+            self._validate_logprobs(params)
+            self._validate_sampling_params(params)
+            self._validate_supported_sampling_params(params)
+
+    def _validate_multi_modal_uuids(self, prompt: PromptType) -> None:
+        """
+        Check that any `multi_modal_uuids` supplied with a prompt line up
+        with its `multi_modal_data`.
+
+        For every modality present in the data, a uuid entry must exist and
+        hold as many elements as the data does (a non-list value counts as
+        one element). Individual uuid values are not inspected. Prompts that
+        are not dicts, or that lack either field, are accepted as-is.
+
+        Raises:
+            ValueError: on a missing modality entry or a length mismatch.
+        """
+
+        def _count(value) -> int:
+            # A bare (non-list) value is treated as a single item.
+            return len(value) if isinstance(value, list) else 1
+
+        def _check(single_prompt: Union[dict, str]) -> None:
+            if not isinstance(single_prompt, dict):
+                return
+            mm_data = single_prompt.get("multi_modal_data")
+            mm_uuids = single_prompt.get("multi_modal_uuids")
+            if not (mm_data and mm_uuids):
+                return
+
+            for modality, items in mm_data.items():
+                if modality not in mm_uuids:
+                    raise ValueError(
+                        f"multi_modal_uuids for modality '{modality}' must "
+                        "be provided if multi_modal_data is provided.")
+
+                data_len = _count(items)
+                uuid_len = _count(mm_uuids[modality])
+                if uuid_len != data_len:
+                    raise ValueError(
+                        f"multi_modal_uuids for modality '{modality}' "
+                        "must have same length as data: got "
+                        f"{uuid_len} uuids vs "
+                        f"{data_len} items.")
+
+        # Explicit encoder/decoder prompts are checked part by part;
+        # anything else is treated as a singleton prompt.
+        if isinstance(prompt, dict) and "encoder_prompt" in prompt:
+            for key in ("encoder_prompt", "decoder_prompt"):
+                part = prompt.get(key)
+                if part is not None:
+                    _check(part)
+        else:
+            _check(prompt)  # type: ignore[arg-type]
+
+    def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
+        """Check that a LoRA request is only sent when LoRA is enabled.
+
+        When a request is accepted and a tokenizer is present, a one-time
+        warning is logged that the base model's tokenizer is used.
+
+        Raises:
+            ValueError: if a LoRA request arrives while `lora_config` is
+                unset.
+        """
+        if lora_request is None:
+            return
+
+        lora_enabled = bool(self.lora_config)
+        if not lora_enabled:
+            # LoRA request passed in while LoRA is not enabled
+            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
+                             "not enabled!")
+
+        if self.tokenizer is None:
+            return
+
+        logger.warning_once(
+            "vLLM has deprecated support for supporting different "
+            "tokenizers for different LoRAs. By default, vLLM uses base "
+            "model's tokenizer. If you are using a LoRA "
+            "with its own tokenizer, consider specifying `--tokenizer "
+            "[lora_path]` to use the LoRA tokenizer.")
+
+    def _validate_structured_output(self, params: SamplingParams) -> None:
+        """Validate a request's structured-output settings and pin a backend.
+
+        Does nothing when the request has no structured outputs or when no
+        structured-outputs config is set. Otherwise this checks that the
+        request does not ask for a backend other than the configured one,
+        rejects an empty `choice` list, and runs the validator for the
+        selected backend.
+
+        Side effect: writes `params.structured_outputs._backend`, and sets
+        `_backend_was_auto` when the final `else` branch picks the backend.
+
+        Raises:
+            ValueError: raised here for `skip_tokenizer_init`, a mismatched
+                request-level backend, or an empty `choice` list. The
+                backend validators are defined elsewhere; the fallback
+                below relies on them signalling failure with ValueError.
+        """
+        if not params.structured_outputs or not self.structured_outputs_config:
+            return
+
+        # `params.structured_outputs` is already known to be truthy here
+        # because of the early return above.
+        if self.model_config.skip_tokenizer_init and params.structured_outputs:
+            raise ValueError(
+                "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'"  # noqa: E501
+            )
+
+        backend = self.structured_outputs_config.backend
+        if _backend := params.structured_outputs._backend:
+            # Request-level backend selection is not supported.
+            # The values may differ if `params` is reused and was set
+            # to a specific backend based on `auto` behavior in a previous
+            # request. We remember that it was set as a result of `auto`
+            # using the `_backend_was_auto` field set in the params.
+            if (backend != _backend
+                    and not (backend == "auto"
+                             and params.structured_outputs._backend_was_auto)):
+                raise ValueError(
+                    "Request-level structured output backend selection is not "
+                    f"supported. The request specified '{_backend}', but vLLM "
+                    f"was initialised with '{backend}'. This error can be "
+                    "resolved by removing '_backend' from the request.")
+        else:
+            # No backend on the request yet: adopt the configured one.
+            params.structured_outputs._backend = backend
+
+        # Request content validation
+        if (isinstance(params.structured_outputs.choice, list)
+                and not params.structured_outputs.choice):
+            # It is invalid for choice to be an empty list
+            raise ValueError(
+                f"Choice '{params.structured_outputs.choice}' cannot be an empty list"  # noqa: E501
+            )
+
+        # Dispatch on the *configured* backend, not the request-level value.
+        if backend.startswith("xgrammar"):
+            # xgrammar with no fallback
+            validate_xgrammar_grammar(params)
+        elif backend.startswith("guidance"):
+            # TODO: ideally we would have the LLTokenizer here as Lark syntax
+            # allows <|special_token|> and similar, see
+            # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
+            # Without tokenizer these are disallowed in grammars.
+            validate_guidance_grammar(params, tokenizer=None)
+        elif backend == "outlines":
+            # outlines backend
+            validate_structured_output_request_outlines(params)
+        elif backend == "lm-format-enforcer":
+            # lm format enforcer backend
+            validate_structured_output_request_lm_format_enforcer(params)
+        else:
+            # NOTE: this branch is intended for backend == "auto". No check
+            # of the supported backends is visible in this method, so any
+            # other unrecognised value would also land here; presumably the
+            # config restricts the value elsewhere - TODO confirm.
+            # In this mode, we set opinionated defaults based on what we think
+            # will satisfy the most use cases without having to worry about
+            # this setting. We include fallback behavior here, but not with any
+            # other setting where a specific backend was specified.
+            try:
+                validate_xgrammar_grammar(params)
+                params.structured_outputs._backend = "xgrammar"
+            except ValueError:
+                # The request either failed validation
+                # or includes some jsonschema feature(s) that
+                # are not supported in xgrammar. Fall back to guidance.
+                validate_guidance_grammar(params, tokenizer=None)
+                params.structured_outputs._backend = "guidance"
+            # Remember that this backend was set automatically
+            params.structured_outputs._backend_was_auto = True
+
+    def _maybe_build_mm_uuids(
+        self,
+        request_id: str,
+        prompt: PromptType,
+    ) -> Optional[MultiModalUUIDDict]:
+        """Build per-item multimodal hash overrides from the request id.
+
+        Each multimodal item is identified by
+        `"{request_id}-{modality}-{index}"` instead of by its content. For
+        prompts with an `encoder_prompt` key, only the encoder prompt's
+        `multi_modal_data` is considered.
+
+        Returns a dictionary of modality -> list[str] of overrides, or None
+        if the prompt carries no multimodal data.
+        """
+        # Locate the multimodal data, if there is any.
+        if not isinstance(prompt, dict):
+            mm_data = None
+        elif "encoder_prompt" in prompt:
+            enc = prompt.get("encoder_prompt")
+            mm_data = (enc.get("multi_modal_data")
+                       if isinstance(enc, dict) else None)
+        else:
+            mm_data = prompt.get("multi_modal_data")
+
+        if not mm_data:
+            return None
+
+        overrides: MultiModalUUIDDict = {}
+        for modality, data in mm_data.items():
+            # A non-list value counts as a single item.
+            item_count = len(data) if isinstance(data, list) else 1
+            overrides[modality] = [
+                f"{request_id}-{modality}-{i}" for i in range(item_count)
+            ]
+        return overrides
+
+    def process_inputs(
+        self,
+        request_id: str,
+        prompt: PromptType,
+        params: Union[SamplingParams, PoolingParams],
+        arrival_time: Optional[float] = None,
+        lora_request: Optional[LoRARequest] = None,
+        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        priority: int = 0,
+        data_parallel_rank: Optional[int] = None,
+    ) -> tuple[Optional[str], EngineCoreRequest]:
+        """Validate a request and turn it into an `EngineCoreRequest`.
+
+        The steps, in order: validate the LoRA request and params, range
+        check `data_parallel_rank`, default `arrival_time` to `time.time()`,
+        choose multimodal hash overrides, run the input preprocessor and the
+        platform's request validation, validate the resulting model inputs,
+        clone and complete the params, and collect multimodal features
+        sorted by their position in the input sequence.
+
+        Returns:
+            `(prompt_str, request)`. `prompt_str` is None for embeds inputs
+            and otherwise whatever the decoder inputs hold under "prompt"
+            (possibly None).
+
+        Raises:
+            ValueError: if `data_parallel_rank` is outside
+                `[0, data_parallel_size)`, or from any of the validators.
+        """
+
+        # TODO(woosuk): Support pooling models.
+        self._validate_lora(lora_request)
+        self._validate_params(params)
+
+        data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
+        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
+                                                   data_parallel_size):
+            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
+                             f"is out of range [0, {data_parallel_size}).")
+
+        if arrival_time is None:
+            arrival_time = time.time()
+
+        # Optionally generate multimodal hash overrides to avoid hashing
+        # multimodal data items by their content as their identifiers.
+
+        # NOTE: when users explicitly turn off BOTH prefix caching and input
+        # processing caching, no multimodal features or embeddings will be
+        # reused across requests, therefore identifying multimodal data items
+        # by their content is no longer necessary, and we create uuids with
+        # request id-modality-index as multimodal hash overrides.
+        if (self.model_config.multimodal_config and
+                self.model_config.multimodal_config.mm_processor_cache_gb == 0
+                and not self.cache_config.enable_prefix_caching):
+            mm_uuids = self._maybe_build_mm_uuids(request_id, prompt)
+        else:
+            # Otherwise, use user-provided uuids as multimodal hash overrides
+            # if provided.
+            self._validate_multi_modal_uuids(prompt)
+            if isinstance(prompt, dict):
+                mm_uuids = prompt.get("multi_modal_uuids")
+            else:
+                mm_uuids = None
+
+        # Process inputs, which includes:
+        # 1. Tokenize text prompt, with LoRA request if one exists.
+        # 2. For multimodal models with a merged preprocessor, preprocess
+        #   multimodal data and expand prompt token ids accordingly.
+        processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
+            prompt,
+            tokenization_kwargs=tokenization_kwargs,
+            mm_uuids=mm_uuids,
+        )
+        # Imported at call time rather than at module import.
+        # NOTE(review): presumably to avoid an import cycle or an eager
+        # platform lookup - confirm.
+        from vllm.platforms import current_platform
+        current_platform.validate_request(
+            prompt=prompt,
+            params=params,
+            processed_inputs=processed_inputs,
+        )
+
+        eos_token_id = self.input_preprocessor.get_eos_token_id()
+
+        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
+        self._validate_model_inputs(encoder_inputs, decoder_inputs)
+
+        # Mypy does not always properly infer the types of some elements of
+        # discriminated unions of TypedDicts, because of how it handles
+        # inheritance of TypedDict. If we explicitly extract the items we want
+        # we can avoid type errors from using `dict.get` later in the method.
+        # Token ids and embeds are mutually exclusive: embeds inputs carry
+        # neither a prompt string nor token ids.
+        prompt_str: Optional[str] = None if decoder_inputs[
+            "type"] == "embeds" else decoder_inputs.get("prompt")
+        prompt_token_ids = decoder_inputs[
+            "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
+        prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
+            "type"] == "embeds" else None
+
+        # Exactly one of sampling_params / pooling_params is set below.
+        sampling_params = None
+        pooling_params = None
+        if isinstance(params, SamplingParams):
+            # TODO: can we avoid cloning here in multiproc case?
+            sampling_params = params.clone()
+            # If unset max tokens, then generate up to the max_model_len.
+            if sampling_params.max_tokens is None:
+                seq_len = length_from_prompt_token_ids_or_embeds(
+                    prompt_token_ids, prompt_embeds)
+                sampling_params.max_tokens = \
+                    self.model_config.max_model_len - seq_len
+            sampling_params.update_from_generation_config(
+                self.generation_config_fields, eos_token_id)
+            if self.tokenizer is not None:
+                sampling_params.update_from_tokenizer(self.tokenizer)
+        else:
+            pooling_params = params.clone()
+
+        # Multimodal related.
+        mm_features: Optional[list[MultiModalFeatureSpec]] = None
+
+        if decoder_inputs["type"] == "multimodal":
+            decoder_mm_inputs = decoder_inputs["mm_kwargs"]
+            decoder_mm_positions = decoder_inputs["mm_placeholders"]
+            decoder_mm_hashes = decoder_inputs["mm_hashes"]
+
+            # Merge and flatten multimodal placeholders, hashes and inputs
+            # from dictionaries to lists, and sort them by each item's position
+            # in the input sequence.
+            sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)
+
+            mm_features = []
+            for modality, idx in sorted_mm_idxs:
+                mm_features.append(
+                    MultiModalFeatureSpec(
+                        data=decoder_mm_inputs[modality][idx],
+                        modality=modality,
+                        identifier=decoder_mm_hashes[modality][idx],
+                        mm_position=decoder_mm_positions[modality][idx]))
+
+        return prompt_str, EngineCoreRequest(
+            request_id=request_id,
+            prompt_token_ids=prompt_token_ids,
+            prompt_embeds=prompt_embeds,
+            mm_features=mm_features,
+            sampling_params=sampling_params,
+            pooling_params=pooling_params,
+            eos_token_id=eos_token_id,
+            arrival_time=arrival_time,
+            lora_request=lora_request,
+            cache_salt=decoder_inputs.get("cache_salt"),
+            priority=priority,
+            data_parallel_rank=data_parallel_rank,
+            trace_headers=trace_headers,
+        )
+
+    def _validate_model_inputs(self, encoder_inputs: Optional[SingletonInputs],
+                               decoder_inputs: SingletonInputs):
+        """Validate the encoder inputs, when present, then the decoder's."""
+        has_encoder = encoder_inputs is not None
+        if has_encoder:
+            self._validate_model_input(encoder_inputs, prompt_type="encoder")
+
+        # The decoder side is always validated.
+        self._validate_model_input(decoder_inputs, prompt_type="decoder")
+
+    def _validate_model_input(
+        self,
+        prompt_inputs: SingletonInputs,
+        *,
+        prompt_type: Literal["encoder", "decoder"],
+    ):
+        """Validate one singleton prompt (encoder or decoder side).
+
+        Checks, in order: the prompt is not empty (with exceptions for
+        multimodal encoder prompts and embeds inputs), no token id is out
+        of vocabulary (skipped when `skip_tokenizer_init` is set), and the
+        prompt length does not exceed `max_model_len`.
+
+        Raises:
+            ValueError: for an empty prompt, an out-of-vocabulary token id,
+                or a prompt longer than the maximum model length.
+        """
+        model_config = self.model_config
+
+        # Embeds inputs have no token ids; other inputs have no embeds.
+        prompt_ids = None if prompt_inputs[
+            "type"] == "embeds" else prompt_inputs["prompt_token_ids"]
+        prompt_embeds = prompt_inputs["prompt_embeds"] if prompt_inputs[
+            "type"] == "embeds" else None
+        prompt_len = length_from_prompt_token_ids_or_embeds(
+            prompt_ids, prompt_embeds)
+        if not prompt_ids:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            elif prompt_inputs["type"] == "embeds":
+                pass  # Prompt embeds should not have prompt_ids.
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        # `tokenizer` is also passed to `create_processor` further down, so
+        # it is bound in both branches.
+        if self.model_config.skip_tokenizer_init:
+            tokenizer = None
+        else:
+            tokenizer = self.tokenizer
+            # An empty or missing id list yields 0, which always passes.
+            max_input_id = max(prompt_ids or [], default=0)
+
+            # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
+            # self.model_config.get_vocab_size() is the model’s vocab size.
+            # For Qwen3 models, the language model has extra tokens that do
+            # not exist in the tokenizer, and vice versa for multimodal
+            # placeholder tokens in some multimodal models.
+            # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501
+            # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501
+
+            # Here we take the max of the two to determine if a token id is
+            # truly out-of-vocabulary.
+            if max_input_id > max(tokenizer.max_token_id,
+                                  self.model_config.get_vocab_size() - 1):
+                raise ValueError(
+                    f"Token id {max_input_id} is out of vocabulary")
+
+        max_prompt_len = self.model_config.max_model_len
+        if prompt_len > max_prompt_len:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config,
+                    tokenizer=tokenizer,
+                )
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)
+
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+            # Pick the hint appended to the error message below.
+            if model_config.is_multimodal_model:
+                suggestion = (
+                    "Make sure that `max_model_len` is no smaller than the "
+                    "number of text tokens plus multimodal tokens. For image "
+                    "inputs, the number of image tokens depends on the number "
+                    "of images, and possibly their aspect ratios as well.")
+            else:
+                suggestion = (
+                    "Make sure that `max_model_len` is no smaller than the "
+                    "number of text tokens.")
+
+            raise ValueError(
+                f"The {prompt_type} prompt (length {prompt_len}) is "
+                f"longer than the maximum model length of {max_prompt_len}. "
+                f"{suggestion}")
+
+            # TODO: Find out how many placeholder tokens are there so we can
+            # check that chunked prefill does not truncate them
+            # max_batch_len = self.scheduler_config.max_num_batched_tokens
+
+    def clear_cache(self) -> None:
+        """Delegate to `input_preprocessor.clear_cache()`."""
+        preprocessor = self.input_preprocessor
+        preprocessor.clear_cache()
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -0,0 +1,860 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import contextlib
+import os
+import weakref
+from collections.abc import Iterator
+from dataclasses import dataclass
+from enum import Enum, auto
+from multiprocessing import Process, connection
+from multiprocessing.process import BaseProcess
+from typing import TYPE_CHECKING, Callable, Optional, Union
+from unittest.mock import patch
+
+import msgspec
+import zmq
+
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.ray.ray_env import get_env_vars_to_copy
+from vllm.utils import get_mp_context, get_open_zmq_ipc_path, zmq_socket_ctx
+from vllm.v1.engine.coordinator import DPCoordinator
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.utils import get_engine_client_zmq_addr, shutdown
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
+logger = init_logger(__name__)
+
+# Startup polling period; the `_MS` suffix indicates milliseconds (10 s).
+# NOTE(review): the consumers of this constant are outside this excerpt -
+# confirm how it is used before changing the value.
+STARTUP_POLL_PERIOD_MS = 10000
+
+
+class CoreEngineState(Enum):
+    """States a `CoreEngine` can be in; every engine starts out as NEW."""
+    NEW = auto()
+    CONNECTED = auto()
+    READY = auto()
+
+
+class CoreEngine:
+    """One per data parallel rank, used to track state during handshaking."""
+
+    def __init__(self, index: int = 0, local: bool = True):
+        # The identity is the index encoded as two little-endian bytes.
+        identity = index.to_bytes(2, "little")
+
+        self.state = CoreEngineState.NEW
+        self.identity = identity
+        self.local = local
+
+
+@dataclass
+class EngineZmqAddresses:
+    """ZMQ socket addresses for front-end clients and, where applicable,
+    the DP coordinator."""
+    # ZMQ input socket addresses for each front-end client (requests)
+    inputs: list[str]
+    # ZMQ output socket addresses for each front-end client (responses)
+    outputs: list[str]
+    # ZMQ input socket address of DP coordinator if applicable
+    coordinator_input: Optional[str] = None
+    # ZMQ output socket address of DP coordinator if applicable
+    coordinator_output: Optional[str] = None
+    # ZMQ socket for front-end to connect to DP coordinator.
+    # Not used by engine, just relayed to front-end in handshake response.
+    # Only required for external DP LB case.
+    frontend_stats_publish_address: Optional[str] = None
+
+
+@dataclass
+class EngineHandshakeMetadata:
+    """Metadata sent to each engine process during startup handshake,
+    including addresses of the front-end ZMQ queues that they should
+    connect to.
+    """
+    # ZMQ addresses the engine should connect to.
+    addresses: EngineZmqAddresses
+    # Parallel-config values keyed by name; values are ints, strings or
+    # lists of ints. NOTE(review): the exact keys are set by the sender,
+    # which is outside this excerpt - confirm there.
+    parallel_config: dict[str, Union[int, str, list[int]]]
+
+
+class CoreEngineProcManager:
+    """
+    Utility class to handle creation, readiness, and shutdown
+    of background processes used by the AsyncLLM and LLMEngine.
+    """
+
+    def __init__(
+        self,
+        target_fn: Callable,
+        local_engine_count: int,
+        start_index: int,
+        local_start_index: int,
+        vllm_config: VllmConfig,
+        local_client: bool,
+        handshake_address: str,
+        executor_class: type[Executor],
+        log_stats: bool,
+        client_handshake_address: Optional[str] = None,
+    ):
+        """Create and start one background process per local engine.
+
+        Each process runs `target_fn` with the shared keyword arguments plus
+        its own `dp_rank` (`start_index + i`) and `local_dp_rank`
+        (`local_start_index + i`), and is named `EngineCore_DP{dp_rank}`.
+        `client_handshake_address` is only forwarded when it is truthy.
+
+        A `weakref.finalize` calling `shutdown` on the process list is
+        registered before any process is started. If any process already
+        has an exit code once the start loop ends (normally or through an
+        exception), all processes are shut down via `close()`.
+        """
+        context = get_mp_context()
+        # Keyword arguments shared by every engine process.
+        common_kwargs = {
+            "vllm_config": vllm_config,
+            "local_client": local_client,
+            "handshake_address": handshake_address,
+            "executor_class": executor_class,
+            "log_stats": log_stats,
+        }
+
+        if client_handshake_address:
+            common_kwargs[
+                "client_handshake_address"] = client_handshake_address
+
+        self.processes: list[BaseProcess] = []
+        local_dp_ranks = []
+        for index in range(local_engine_count):
+            local_index = local_start_index + index
+            global_index = start_index + index
+
+            # Start EngineCore in background process.
+            local_dp_ranks.append(local_index)
+            self.processes.append(
+                context.Process(target=target_fn,
+                                name=f"EngineCore_DP{global_index}",
+                                kwargs=common_kwargs | {
+                                    "dp_rank": global_index,
+                                    "local_dp_rank": local_index,
+                                }))
+
+        # Registered before starting so the processes are cleaned up even
+        # if this object is garbage collected without close() being called.
+        self._finalizer = weakref.finalize(self, shutdown, self.processes)
+
+        data_parallel = vllm_config.parallel_config.data_parallel_size > 1
+        try:
+            for proc, local_dp_rank in zip(self.processes, local_dp_ranks):
+                # With data parallelism, the device-control env var is
+                # patched only for the duration of this start() call.
+                with set_device_control_env_var(
+                        vllm_config, local_dp_rank) if (
+                            data_parallel) else contextlib.nullcontext():
+                    proc.start()
+        finally:
+            # Kill other procs if not all are running.
+            if self.finished_procs():
+                self.close()
+
+    def close(self):
+        """Shut down all processes by invoking the registered finalizer."""
+        run_finalizer = self._finalizer
+        run_finalizer()
+
+    def join_first(self):
+        """Block until at least one of the managed processes has exited."""
+        exit_sentinels = [proc.sentinel for proc in self.processes]
+        connection.wait(exit_sentinels)
+
+    def sentinels(self) -> list:
+        """Return the sentinel of every managed process, in process order."""
+        collected = []
+        for proc in self.processes:
+            collected.append(proc.sentinel)
+        return collected
+
+    def finished_procs(self) -> dict[str, int]:
+        """Map the name of each exited process to its exit code.
+
+        Processes whose `exitcode` is still None are left out.
+        """
+        exit_codes: dict[str, int] = {}
+        for proc in self.processes:
+            code = proc.exitcode
+            if code is None:
+                continue
+            exit_codes[proc.name] = code
+        return exit_codes
+
+
+@contextlib.contextmanager
+def set_device_control_env_var(vllm_config: VllmConfig,
+                               local_dp_rank: int) -> Iterator[None]:
+    """
+    Patch the platform's device-control environment variable (e.g.
+    CUDA_VISIBLE_DEVICES) for the duration of the `with` block, so that an
+    engine subprocess started inside it sees only its own devices.
+    """
+    parallel_config = vllm_config.parallel_config
+    evar = current_platform.device_control_env_var
+
+    device_list = get_device_indices(evar, local_dp_rank,
+                                     parallel_config.world_size)
+    # patch.dict restores the previous environment on exit.
+    with patch.dict(os.environ, values={evar: device_list}):
+        yield
+
+
+def get_device_indices(device_control_env_var: str, local_dp_rank: int,
+                       world_size: int):
+    """
+    Build the comma-separated device list for one data parallel rank.
+
+    The rank owns the logical device indices
+    `[local_dp_rank * world_size, (local_dp_rank + 1) * world_size)`; each is
+    translated with `current_platform.device_id_to_physical_device_id`.
+    With world_size=2 and local_dp_rank=1, that covers logical indices 2
+    and 3.
+
+    Raises:
+        Exception: if the translation raises IndexError; the message names
+            the env var, the requested range and the env var's current
+            value.
+    """
+    first = local_dp_rank * world_size
+    stop = (local_dp_rank + 1) * world_size
+    try:
+        physical_ids = [
+            str(current_platform.device_id_to_physical_device_id(i))
+            for i in range(first, stop)
+        ]
+    except IndexError as e:
+        raise Exception(f"Error setting {device_control_env_var}: "
+                        f"local range: [{local_dp_rank * world_size}, "
+                        f"{(local_dp_rank + 1) * world_size}) "
+                        "base value: "
+                        f"\"{os.getenv(device_control_env_var)}\"") from e
+    return ",".join(physical_ids)
+
+
+class CoreEngineActorManager:
+    """
+    Utility class to handle creation, readiness, and shutdown
+    of core engine Ray actors used by the AsyncLLM and LLMEngine.
+
+    Different from CoreEngineProcManager, this class manages
+    core engines for both local and remote nodes.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        addresses: EngineZmqAddresses,
+        executor_class: type[Executor],
+        log_stats: bool,
+        placement_groups: Optional[list["PlacementGroup"]] = None,
+        local_dp_ranks: Optional[list[int]] = None,
+    ):
+        import copy
+
+        import ray
+        from ray.runtime_env import RuntimeEnv
+        from ray.util.scheduling_strategies import (
+            PlacementGroupSchedulingStrategy)
+
+        from vllm.v1.engine.core import DPEngineCoreActor
+
+        self.local_engine_actors: list[ray.ActorHandle] = []
+        self.remote_engine_actors: list[ray.ActorHandle] = []
+
+        env_vars_list = get_env_vars_to_copy(destination="DPEngineCoreActor")
+        self.env_vars_dict = {
+            name: os.environ[name]
+            for name in env_vars_list if name in os.environ
+        }
+        runtime_env = RuntimeEnv(env_vars=self.env_vars_dict)
+
+        self.addresses = addresses
+        self.executor_class = executor_class
+        self.log_stats = log_stats
+        dp_size = vllm_config.parallel_config.data_parallel_size
+        local_engine_count = \
+            vllm_config.parallel_config.data_parallel_size_local
+        world_size = vllm_config.parallel_config.world_size
+
+        if ray.is_initialized():
+            logger.info(
+                "Ray is already initialized. Skipping Ray initialization.")
+        else:
+            ray.init()
+
+        if placement_groups is not None:
+            assert local_dp_ranks is not None, (
+                "local_dp_ranks must be provided if "
+                "placement_groups is provided")
+            assert len(placement_groups) == len(local_dp_ranks), (
+                "placement_groups and local_dp_ranks must "
+                "have the same length")
+            logger.info("Using provided placement groups")
+            # TODO(rui): validate passed-in placement groups
+            self.created_placement_groups = []
+        else:
+            placement_groups, local_dp_ranks = \
+                CoreEngineActorManager.create_dp_placement_groups(vllm_config)
+            self.created_placement_groups = placement_groups
+        assert len(placement_groups) == dp_size, (
+            "Number of placement groups must match data parallel size")
+
+        self.placement_group_is_local = []
+        refs = []
+        for index, local_index, pg in zip(range(dp_size), local_dp_ranks,
+                                          placement_groups):
+            dp_vllm_config = copy.deepcopy(vllm_config)
+            dp_vllm_config.parallel_config.placement_group = pg
+            local_client = index < local_engine_count
+
+            # Ray XPU known issue: dpctl initializes the GPU runtime early, so
+            # setting device env vars in Ray actor's initialization method
+            # will not affect device selection. See:
+            # https://github.com/ray-project/ray/blob/master/python/ray/_private/accelerators/intel_gpu.py#L56 # noqa: E501
+            if current_platform.is_xpu():
+                device_evar = current_platform.device_control_env_var
+                device_indices = get_device_indices(device_evar, local_index,
+                                                    world_size)
+                actor_env_vars = self.env_vars_dict.copy()
+                actor_env_vars[device_evar] = device_indices
+                runtime_env = RuntimeEnv(env_vars=actor_env_vars)
+
+            actor = ray.remote(DPEngineCoreActor).options(
+                scheduling_strategy=PlacementGroupSchedulingStrategy(
+                    placement_group=pg,
+                    placement_group_bundle_index=world_size,
+                ),
+                runtime_env=runtime_env).remote(vllm_config=dp_vllm_config,
+                                                executor_class=executor_class,
+                                                log_stats=log_stats,
+                                                local_client=local_client,
+                                                addresses=addresses,
+                                                dp_rank=index,
+                                                local_dp_rank=local_index)
+            if local_client:
+                self.local_engine_actors.append(actor)
+            else:
+                self.remote_engine_actors.append(actor)
+            self.placement_group_is_local.append(local_client)
+            refs.append(actor.wait_for_init.remote())
+
+        ray.get(refs)
+        self.run_refs = []
+        for actor in self.local_engine_actors + self.remote_engine_actors:
+            self.run_refs.append(actor.run.remote())
+
+    @staticmethod
+    def create_dp_placement_groups(
+            vllm_config: VllmConfig
+    ) -> tuple[list["PlacementGroup"], list[int]]:
+        """
+        Create placement groups for data parallel.
+        """
+
+        import ray
+        from ray._private.state import available_resources_per_node
+
+        logger.info("Creating placement groups for data parallel")
+        dp_master_ip = \
+            vllm_config.parallel_config.data_parallel_master_ip
+        num_pg_to_create = vllm_config.parallel_config.data_parallel_size
+        local_engine_count = \
+            vllm_config.parallel_config.data_parallel_size_local
+
+        available_resources = available_resources_per_node()
+        world_size = vllm_config.parallel_config.world_size
+        placement_groups: list[PlacementGroup] = []
+        local_dp_ranks: list[int] = []
+        dp_master_ip_key = f'node:{dp_master_ip}'
+        nodes = sorted(available_resources.values(),
+                       key=lambda x: dp_master_ip_key not in x)
+        assert len(nodes) > 0, (
+            "No nodes with resources found in Ray cluster.")
+        assert dp_master_ip_key in nodes[0], (
+            "The DP master node (ip: %s) is missing or dead", dp_master_ip)
+        device_str = current_platform.ray_device_key
+        for node_resources in nodes:
+            if device_str not in node_resources:
+                continue
+            # For now, each DP rank can only be assigned to one node
+            # TODO(rui): support allocating a single DP rank
+            # to multiple nodes
+            available_engine_count = int(
+                node_resources[device_str]) // world_size
+            if dp_master_ip_key in node_resources:
+                assert available_engine_count >= local_engine_count, (
+                    "Not enough resources to allocate DP ranks "
+                    f"on DP master node {dp_master_ip}")
+                for i in range(local_engine_count):
+                    bundles = [{
+                        device_str: 1.0,
+                        "node:" + dp_master_ip: 0.001
+                    }] * world_size + [{
+                        "CPU": 1.0
+                    }]
+                    pg = ray.util.placement_group(
+                        name=f"dp_rank_{len(placement_groups)}",
+                        strategy="STRICT_PACK",
+                        bundles=bundles,
+                    )
+                    placement_groups.append(pg)
+                    local_dp_ranks.append(i)
+            else:
+                for i in range(available_engine_count):
+                    if len(placement_groups) == num_pg_to_create:
+                        break
+                    bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}]
+                    pg = ray.util.placement_group(
+                        name=f"dp_rank_{len(placement_groups)}",
+                        strategy="STRICT_PACK",
+                        bundles=bundles,
+                    )
+                    placement_groups.append(pg)
+                    local_dp_ranks.append(i)
+        if len(placement_groups) < num_pg_to_create:
+            raise ValueError(
+                f"Not enough resources to allocate {num_pg_to_create} "
+                "placement groups, only created "
+                f"{len(placement_groups)} placement groups. "
+                "Available resources: "
+                f"{available_resources}")
+        return placement_groups, local_dp_ranks
+
+    @staticmethod
+    def add_dp_placement_groups(
+        old_vllm_config: VllmConfig, new_data_parallel_size: int
+    ) -> tuple[list["PlacementGroup"], list[int]]:
+        """
+        Add placement groups for new data parallel size.
+        """
+        import ray
+        from ray._private.state import (available_resources_per_node,
+                                        total_resources_per_node)
+        from ray.util.state import list_nodes
+
+        old_dp_size = old_vllm_config.parallel_config.data_parallel_size
+        num_pg_to_create = new_data_parallel_size - old_dp_size
+
+        if num_pg_to_create <= 0:
+            return [], []
+
+        dp_master_ip = old_vllm_config.parallel_config.data_parallel_master_ip
+        world_size = old_vllm_config.parallel_config.world_size
+
+        nodes = list_nodes()
+        nodes = sorted(nodes, key=lambda node: node.node_ip != dp_master_ip)
+        assert nodes[0].node_ip == dp_master_ip, (
+            "The first node must be the head node")
+        assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
+            "There can only be one head node")
+
+        available_resources = available_resources_per_node()
+        total_resources = total_resources_per_node()
+
+        placement_groups = []
+        local_dp_ranks = []
+        num_pg_created = 0
+
+        device_str = current_platform.ray_device_key
+        for node in nodes:
+            if num_pg_created >= num_pg_to_create:
+                break
+
+            node_ip = node.node_ip
+            node_id = node.node_id
+            available_gpus = int(available_resources[node_id][device_str])
+
+            # Get total GPUs on this node from the node's resources
+            # Ray stores node resources with node ID as key
+            total_gpus = int(total_resources[node_id][device_str])
+
+            # Calculate used GPUs and used engines on this node
+            used_gpus = max(0, total_gpus - available_gpus)
+            used_engines_on_node = used_gpus // world_size
+
+            # Calculate how many new engines this node can accommodate
+            available_engine_count = available_gpus // world_size
+
+            # Create placement groups for new engines on this node
+            for i in range(available_engine_count):
+                if num_pg_created >= num_pg_to_create:
+                    break
+
+                rank = old_dp_size + num_pg_created
+
+                # Create bundles with node constraint for master node
+                if node_ip == dp_master_ip:
+                    bundles = [{
+                        device_str: 1.0,
+                        "node:" + dp_master_ip: 0.001
+                    }] * world_size + [{
+                        "CPU": 1.0
+                    }]
+                else:
+                    bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}]
+
+                pg = ray.util.placement_group(
+                    name=f"dp_rank_{rank}",
+                    strategy="STRICT_PACK",
+                    bundles=bundles,
+                )
+                placement_groups.append(pg)
+
+                # Local rank starts from the number of engines already used
+                # on this node
+                local_rank = used_engines_on_node + i
+                local_dp_ranks.append(local_rank)
+                num_pg_created += 1
+
+        return placement_groups, local_dp_ranks
+
+    def scale_up_elastic_ep(self, cur_vllm_config: VllmConfig,
+                            new_data_parallel_size: int) -> None:
+        import copy
+
+        import ray
+        from ray.runtime_env import RuntimeEnv
+        from ray.util.scheduling_strategies import (
+            PlacementGroupSchedulingStrategy)
+
+        from vllm.v1.engine.core import DPEngineCoreActor
+
+        cur_data_parallel_size = len(self.local_engine_actors) + \
+            len(self.remote_engine_actors)
+
+        assert new_data_parallel_size > cur_data_parallel_size, (
+            f"New data parallel size {new_data_parallel_size} must be greater "
+            f"than current data parallel size {cur_data_parallel_size} "
+            "for scale up")
+
+        placement_groups, local_dp_ranks = \
+            self.add_dp_placement_groups(
+                cur_vllm_config, new_data_parallel_size)
+
+        world_size = cur_vllm_config.parallel_config.world_size
+        dp_master_ip = cur_vllm_config.parallel_config.data_parallel_master_ip
+        new_local_engines = 0
+
+        runtime_env = RuntimeEnv(env_vars=self.env_vars_dict
+                                 | {"VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": "1"})
+        for i, (pg,
+                local_rank) in enumerate(zip(placement_groups,
+                                             local_dp_ranks)):
+            rank = cur_data_parallel_size + i
+            dp_vllm_config = copy.deepcopy(cur_vllm_config)
+            dp_vllm_config.parallel_config.data_parallel_size = \
+                new_data_parallel_size
+            dp_vllm_config.parallel_config.placement_group = pg
+
+            # Check if this placement group is on the head node
+            local_client = any(
+                bundle.get("node:" + dp_master_ip, 0) > 0
+                for bundle in pg.bundle_specs)
+
+            if local_client:
+                new_local_engines += 1
+                # Update data_parallel_size_local
+                dp_vllm_config.parallel_config.data_parallel_size_local = (
+                    cur_vllm_config.parallel_config.data_parallel_size_local +
+                    new_local_engines)
+
+            actor = ray.remote(DPEngineCoreActor).options(
+                scheduling_strategy=PlacementGroupSchedulingStrategy(
+                    placement_group=pg,
+                    placement_group_bundle_index=world_size,
+                ),
+                runtime_env=runtime_env).remote(
+                    vllm_config=dp_vllm_config,
+                    executor_class=self.executor_class,
+                    log_stats=self.log_stats,
+                    local_client=local_client,
+                    addresses=self.addresses,
+                    dp_rank=rank,
+                    local_dp_rank=local_rank)
+
+            if local_client:
+                self.local_engine_actors.append(actor)
+            else:
+                self.remote_engine_actors.append(actor)
+            self.created_placement_groups.append(pg)
+            self.placement_group_is_local.append(local_client)
+
+        ray.get([
+            actor.wait_for_init.remote()
+            for actor in (self.local_engine_actors[-new_local_engines:]
+                          if new_local_engines > 0 else []) +
+            self.remote_engine_actors[-(len(placement_groups) -
+                                        new_local_engines):]
+        ])
+
+        actors = (self.local_engine_actors[-new_local_engines:]
+                  if new_local_engines > 0 else []) + \
+            self.remote_engine_actors[-(len(placement_groups) -
+                                        new_local_engines):]
+
+        for actor in actors:
+            self.run_refs.append(actor.run.remote())
+
+        cur_vllm_config.parallel_config.data_parallel_size = \
+            new_data_parallel_size
+        # Update old_vllm_config with new data_parallel_size_local if any new
+        # local engines were added
+        if new_local_engines > 0:
+            cur_vllm_config.parallel_config.data_parallel_size_local += \
+                new_local_engines
+
+    def scale_down_elastic_ep(self, cur_data_parallel_size: int,
+                              new_data_parallel_size: int) -> None:
+        import ray
+        assert cur_data_parallel_size > new_data_parallel_size, (
+            f"cur_data_parallel_size {cur_data_parallel_size} must be greater "
+            f"than new_data_parallel_size {new_data_parallel_size} "
+            "for scale down")
+        for _ in range(cur_data_parallel_size - new_data_parallel_size):
+            pg = self.created_placement_groups.pop()
+            is_local = self.placement_group_is_local.pop()
+            if is_local:
+                self.local_engine_actors.pop()
+            else:
+                self.remote_engine_actors.pop()
+            ray.util.remove_placement_group(pg)
+
+    def get_run_refs(self):
+        return self.run_refs
+
+    def close(self):
+        import ray
+        for actor in self.local_engine_actors + self.remote_engine_actors:
+            ray.kill(actor)
+        for pg in self.created_placement_groups:
+            ray.util.remove_placement_group(pg)
+
+
+@contextlib.contextmanager
+def launch_core_engines(
+    vllm_config: VllmConfig,
+    executor_class: type[Executor],
+    log_stats: bool,
+    num_api_servers: int = 1,
+) -> Iterator[tuple[
+        Optional[Union[CoreEngineProcManager, CoreEngineActorManager]],
+        Optional[DPCoordinator],
+        EngineZmqAddresses,
+]]:
+    """Launch engine and DP coordinator processes as needed.
+
+    Used as a context manager. The yielded tuple is
+    ``(engine_manager, coordinator, addresses)``:
+
+    - ``engine_manager`` is a CoreEngineActorManager for the "ray" data
+      parallel backend, otherwise a CoreEngineProcManager, or None when
+      there are no local engines to start.
+    - ``coordinator`` is the DPCoordinator, or None when none is run here.
+    - ``addresses`` holds one input and one output ZMQ address per API
+      server, plus coordinator addresses when a coordinator was started.
+
+    For the non-ray backend, the engine startup handshake
+    (``wait_for_engine_startup``) runs after the ``yield``, i.e. when the
+    caller's ``with`` body has completed. The ray path returns right
+    after the ``yield`` without a handshake.
+
+    Args:
+        vllm_config: Engine configuration; its ``parallel_config`` drives
+            every decision made here.
+        executor_class: Executor class passed to the engine manager.
+        log_stats: Stats logging flag passed to the engine manager.
+        num_api_servers: Number of input/output address pairs to allocate.
+    """
+
+    parallel_config = vllm_config.parallel_config
+    dp_size = parallel_config.data_parallel_size
+    local_engine_count = parallel_config.data_parallel_size_local
+    local_start_index = parallel_config.data_parallel_rank_local
+    dp_rank = parallel_config.data_parallel_rank
+    host = parallel_config.data_parallel_master_ip
+    local_engines_only = (parallel_config.data_parallel_hybrid_lb
+                          or parallel_config.data_parallel_external_lb)
+
+    # In offline mode there is an LLM instance per DP rank and
+    # one core engine per LLM, see
+    # examples/offline_inference/data_parallel.py.
+    offline_mode = local_start_index is not None
+
+    # client_local_only = True for cases where this front-end
+    # sends requests only to colocated engines.
+    client_local_only = (offline_mode or local_engines_only
+                         or (local_engine_count == dp_size))
+
+    # Set up input and output addresses.
+    addresses = EngineZmqAddresses(
+        inputs=[
+            get_engine_client_zmq_addr(client_local_only, host)
+            for _ in range(num_api_servers)
+        ],
+        outputs=[
+            get_engine_client_zmq_addr(client_local_only, host)
+            for _ in range(num_api_servers)
+        ],
+    )
+
+    # Run the DP Coordinator process with rank 0 when in
+    # online DP mode.
+    run_coordinator = dp_size > 1 and not offline_mode and dp_rank == 0
+
+    if run_coordinator:
+        coordinator = DPCoordinator(parallel_config)
+
+        # Publish the coordinator's socket addresses through `addresses`.
+        addresses.coordinator_input, addresses.coordinator_output = (
+            coordinator.get_engine_socket_addresses())
+        addresses.frontend_stats_publish_address = (
+            coordinator.get_stats_publish_address())
+
+        logger.info("Started DP Coordinator process (PID: %d)",
+                    coordinator.proc.pid)
+    else:
+        coordinator = None
+
+    if parallel_config.data_parallel_backend == "ray":
+        logger.info("Starting ray-based data parallel backend")
+
+        # The actor manager's constructor launches the Ray actors itself,
+        # so no handshake socket is set up on this path.
+        engine_actor_manager = CoreEngineActorManager(
+            vllm_config=vllm_config,
+            addresses=addresses,
+            executor_class=executor_class,
+            log_stats=log_stats,
+        )
+
+        yield engine_actor_manager, coordinator, addresses
+        return
+
+    # Decide which engines this process must complete a handshake with.
+    if offline_mode:
+        assert local_engine_count == 1
+        engines_to_handshake = [CoreEngine(index=dp_rank, local=True)]
+    elif dp_rank == 0:
+        # Rank 0 holds Coordinator, so it handshakes with all Cores
+        # in both external dplb and internal dplb mode.
+        # Note this also covers the case where we have zero local engines
+        # and rank 0 is headless.
+        engines_to_handshake = [
+            CoreEngine(index=i, local=(i < local_engine_count))
+            for i in range(dp_size)
+        ]
+    else:
+        # Rank > 0 handshakes with just the local cores it is managing.
+        assert local_engines_only, (
+            "Attempting to launch core_engines from dp_rank > 0, but "
+            "found internal DPLB, which is incompatible.")
+        engines_to_handshake = [
+            CoreEngine(index=i, local=True)
+            for i in range(dp_rank, dp_rank + local_engine_count)
+        ]
+
+    # Whether the started engines will handshake only with co-located
+    # front-end processes. In external_dp_lb mode, ranks > 0 handshake with
+    # their co-located frontend and also the rank 0 front-end, and hence this
+    # will be False.
+    handshake_local_only = offline_mode or local_engine_count == dp_size
+
+    handshake_address = get_engine_client_zmq_addr(
+        handshake_local_only, host, parallel_config.data_parallel_rpc_port)
+
+    if local_engines_only and dp_rank > 0:
+        # This process binds a separate local IPC path for its own
+        # handshake; the engines still receive `handshake_address` too.
+        assert not handshake_local_only
+        local_handshake_address = get_open_zmq_ipc_path()
+        client_handshake_address = local_handshake_address
+    else:
+        local_handshake_address = handshake_address
+        client_handshake_address = None
+
+    with zmq_socket_ctx(local_handshake_address, zmq.ROUTER,
+                        bind=True) as handshake_socket:
+
+        from vllm.v1.engine.core import EngineCoreProc
+
+        # Start local engines.
+        if local_engine_count:
+            local_engine_manager = CoreEngineProcManager(
+                EngineCoreProc.run_engine_core,
+                vllm_config=vllm_config,
+                executor_class=executor_class,
+                log_stats=log_stats,
+                handshake_address=handshake_address,
+                client_handshake_address=client_handshake_address,
+                local_client=True,
+                local_engine_count=local_engine_count,
+                start_index=dp_rank,
+                local_start_index=local_start_index or 0)
+        else:
+            local_engine_manager = None
+
+        # Hand control back to the caller before blocking on the
+        # handshake below.
+        yield local_engine_manager, coordinator, addresses
+
+        # Now wait for engines to start.
+        wait_for_engine_startup(
+            handshake_socket,
+            addresses,
+            engines_to_handshake,
+            parallel_config,
+            vllm_config.cache_config,
+            local_engine_manager,
+            coordinator.proc if coordinator else None,
+        )
+
+
+def wait_for_engine_startup(
+    handshake_socket: zmq.Socket,
+    addresses: EngineZmqAddresses,
+    core_engines: list[CoreEngine],
+    parallel_config: ParallelConfig,
+    cache_config: CacheConfig,
+    proc_manager: Optional[CoreEngineProcManager],
+    coord_process: Optional[Process],
+):
+    """Run the startup handshake with every engine in ``core_engines``.
+
+    Each engine is expected to send a "HELLO" message, to which this
+    function replies with an EngineHandshakeMetadata message, followed by
+    a "READY" message. The loop ends once no engine is pending in either
+    phase.
+
+    Side effects:
+        - ``cache_config.num_gpu_blocks`` is increased by the
+          ``num_gpu_blocks`` value of every READY message.
+        - ``addresses.frontend_stats_publish_address`` is filled from a
+          READY message's ``dp_stats_address`` if it was still None.
+        - ``engine.state`` of each CoreEngine advances to CONNECTED and
+          then READY.
+
+    Args:
+        handshake_socket: Socket on which engine messages are received and
+            init messages are sent.
+        addresses: Addresses sent to each engine in the init message.
+        core_engines: Engines expected to take part in the handshake.
+        parallel_config: Source of the DP settings sent to the engines and
+            of the expected local engine count.
+        cache_config: Updated with the reported GPU block counts.
+        proc_manager: Manager of local engine processes whose sentinels
+            are watched for early exit, if any.
+        coord_process: Coordinator process to watch for early exit, if any.
+
+    Raises:
+        RuntimeError: If a watched process exits during startup, or a
+            message comes from an unknown engine, disagrees on local/remote
+            or headless status, or arrives in an unexpected state.
+    """
+    # Wait for engine core process(es) to send ready messages.
+    local_count = parallel_config.data_parallel_size_local
+    remote_count = len(core_engines) - local_count
+    # [local, remote] counts
+    conn_pending, start_pending = [local_count, remote_count], [0, 0]
+    poller = zmq.Poller()
+    poller.register(handshake_socket, zmq.POLLIN)
+
+    remote_should_be_headless = not parallel_config.data_parallel_hybrid_lb \
+        and not parallel_config.data_parallel_external_lb
+
+    # Also poll the process sentinels so that a process exiting during
+    # startup is noticed instead of waiting forever.
+    if proc_manager is not None:
+        for sentinel in proc_manager.sentinels():
+            poller.register(sentinel, zmq.POLLIN)
+    if coord_process is not None:
+        poller.register(coord_process.sentinel, zmq.POLLIN)
+    while any(conn_pending) or any(start_pending):
+        events = poller.poll(STARTUP_POLL_PERIOD_MS)
+        if not events:
+            # Poll timed out: log progress and keep waiting.
+            if any(conn_pending):
+                logger.debug(
+                    "Waiting for %d local, %d remote core engine proc(s) "
+                    "to connect.", *conn_pending)
+            if any(start_pending):
+                logger.debug(
+                    "Waiting for %d local, %d remote core engine proc(s) "
+                    "to start.", *start_pending)
+            continue
+        if len(events) > 1 or events[0][0] != handshake_socket:
+            # A registered sentinel fired: one of the local core processes
+            # or the coordinator process exited.
+            finished = proc_manager.finished_procs() if proc_manager else {}
+            if coord_process is not None and coord_process.exitcode is not None:
+                finished[coord_process.name] = coord_process.exitcode
+            raise RuntimeError("Engine core initialization failed. "
+                               "See root cause above. "
+                               f"Failed core proc(s): {finished}")
+
+        # Receive HELLO and READY messages from the input socket.
+        eng_identity, ready_msg_bytes = handshake_socket.recv_multipart()
+        # The identity frame is decoded as a little-endian integer; the
+        # result is only used in log and error messages.
+        eng_index = int.from_bytes(eng_identity, "little")
+        engine = next((e for e in core_engines if e.identity == eng_identity),
+                      None)
+        if engine is None:
+            raise RuntimeError(f"Message from engine with unexpected data "
+                               f"parallel rank: {eng_index}")
+        msg = msgspec.msgpack.decode(ready_msg_bytes)
+        status, local, headless = msg["status"], msg["local"], msg["headless"]
+        if local != engine.local:
+            raise RuntimeError(f"{status} message from "
+                               f"{'local' if local else 'remote'} "
+                               f"engine {eng_index}, expected it to be "
+                               f"{'local' if engine.local else 'remote'}")
+
+        # Remote engines must be headless iff we aren't in hybrid dp lb mode.
+        if not local and headless != remote_should_be_headless:
+            if headless:
+                raise RuntimeError(f"Remote engine {eng_index} must not use "
+                                   f"--headless in external or hybrid dp lb "
+                                   f"mode")
+            else:
+                raise RuntimeError(f"Remote engine {eng_index} must use "
+                                   f"--headless unless in external or hybrid "
+                                   f"dp lb mode")
+
+        if status == "HELLO" and engine.state == CoreEngineState.NEW:
+
+            # Send init message with DP config info.
+            init_message = msgspec.msgpack.encode(
+                EngineHandshakeMetadata(
+                    addresses=addresses,
+                    parallel_config={
+                        "data_parallel_master_ip":
+                        parallel_config.data_parallel_master_ip,
+                        "data_parallel_master_port":
+                        parallel_config.data_parallel_master_port,
+                        "_data_parallel_master_port_list":
+                        parallel_config._data_parallel_master_port_list,
+                        "data_parallel_size":
+                        parallel_config.data_parallel_size,
+                    }))
+            handshake_socket.send_multipart((eng_identity, init_message),
+                                            copy=False)
+            # Move this engine from "waiting to connect" to "waiting to
+            # start" in the matching [local, remote] slot.
+            conn_pending[0 if local else 1] -= 1
+            start_pending[0 if local else 1] += 1
+            engine.state = CoreEngineState.CONNECTED
+        elif status == "READY" and engine.state == CoreEngineState.CONNECTED:
+            # Setup KV cache config with initialization state from
+            # engine core process. Sum values from all engines in DP case.
+            num_gpu_blocks = cache_config.num_gpu_blocks or 0
+            num_gpu_blocks += msg["num_gpu_blocks"]
+            cache_config.num_gpu_blocks = num_gpu_blocks
+
+            # In external DP LB mode, the coordinator address that the
+            # front-end procs connect to is obtained from rank 0 via
+            # one of the engine handshakes, and passed to the local
+            # front-end process in the response from the other.
+            if addresses.frontend_stats_publish_address is None:
+                addresses.frontend_stats_publish_address = msg.get(
+                    "dp_stats_address")
+
+            start_pending[0 if local else 1] -= 1
+            engine.state = CoreEngineState.READY
+        else:
+            # Any other status/state pairing is a protocol violation.
+            raise RuntimeError(f"Unexpected {status} message for "
+                               f"{'local' if local else 'remote'} engine "
+                               f"{eng_index} in {engine.state} state.")
+
+        logger.debug("%s from %s core engine process %s.", status,
+                     "local" if local else "remote", eng_index)