Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Public re-exports of the executor entry points for this package.
from .abstract import Executor
from .uniproc_executor import UniProcExecutor

__all__ = ["Executor", "UniProcExecutor"]

View File

@@ -0,0 +1,352 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from abc import ABC, abstractmethod
from collections.abc import Callable
from concurrent.futures import Future
from functools import cached_property
from typing import TYPE_CHECKING, Literal, TypeVar, overload
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorHandshakeMetadata,
)
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.tasks import SupportedTask
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
from vllm.v1.worker.worker_base import WorkerBase
if TYPE_CHECKING:
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
logger = init_logger(__name__)
_R = TypeVar("_R")
FailureCallback = Callable[[], None]
class Executor(ABC):
    """Abstract base class for vLLM executors.

    An executor is responsible for executing the model on one device,
    or it can be a distributed executor that can execute the model on multiple devices.
    """

    uses_ray: bool = False  # whether the executor uses Ray for orchestration.
    supports_pp: bool = False  # whether the executor supports PP

    @staticmethod
    def get_class(vllm_config: VllmConfig) -> type["Executor"]:
        """Resolve the concrete Executor subclass for the given config.

        The configured backend may be: an Executor subclass itself, one of
        the built-in backend names ("ray", "mp", "uni",
        "external_launcher"), or a fully-qualified class name string.

        Raises:
            TypeError: if a class/qualname backend is not an Executor
                subclass.
            ValueError: if the backend value is not recognized.
        """
        executor_class: type[Executor]
        parallel_config = vllm_config.parallel_config
        distributed_executor_backend = parallel_config.distributed_executor_backend
        # distributed_executor_backend must be set in VllmConfig.__post_init__
        if isinstance(distributed_executor_backend, type):
            if not issubclass(distributed_executor_backend, Executor):
                raise TypeError(
                    "distributed_executor_backend must be a subclass of "
                    f"Executor. Got {distributed_executor_backend}."
                )
            executor_class = distributed_executor_backend
        elif distributed_executor_backend == "ray":
            from vllm.v1.executor.ray_executor import RayDistributedExecutor

            executor_class = RayDistributedExecutor
        elif distributed_executor_backend == "mp":
            from vllm.v1.executor.multiproc_executor import MultiprocExecutor

            executor_class = MultiprocExecutor
        elif distributed_executor_backend == "uni":
            from vllm.v1.executor.uniproc_executor import UniProcExecutor

            executor_class = UniProcExecutor
        elif distributed_executor_backend == "external_launcher":
            # TODO: make v1 scheduling deterministic
            # to support external launcher
            executor_class = ExecutorWithExternalLauncher
        elif isinstance(distributed_executor_backend, str):
            # Treat any other string as a fully-qualified class name.
            executor_class = resolve_obj_by_qualname(distributed_executor_backend)
            if not issubclass(executor_class, Executor):
                raise TypeError(
                    "distributed_executor_backend must be a subclass of "
                    f"Executor. Got {executor_class}."
                )
        else:
            raise ValueError(
                f"Unknown distributed executor backend: {distributed_executor_backend}"
            )
        return executor_class

    def __init__(
        self,
        vllm_config: VllmConfig,
    ) -> None:
        """Store sub-configs for convenient access and run `_init_executor`."""
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config
        self.load_config = vllm_config.load_config
        self.parallel_config = vllm_config.parallel_config
        self.scheduler_config = vllm_config.scheduler_config
        self.device_config = vllm_config.device_config
        self.speculative_config = vllm_config.speculative_config
        self.observability_config = vllm_config.observability_config
        self._init_executor()
        # Sleep-mode bookkeeping (see sleep()/wake_up()).
        self.is_sleeping = False
        self.sleeping_tags: set[str] = set()
        # Set lazily via init_kv_output_aggregator when a KV connector is used.
        self.kv_output_aggregator: KVOutputAggregator | None = None

    @abstractmethod
    def _init_executor(self) -> None:
        # Subclasses perform worker creation / backend setup here.
        raise NotImplementedError

    def initialize_from_config(self, kv_cache_configs: list[KVCacheConfig]) -> None:
        """
        Initialize the KV caches and begin the model execution loop of the
        underlying workers.
        """
        self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
        self.collective_rpc("compile_or_warm_up_model")

    def register_failure_callback(self, callback: FailureCallback):  # noqa: B027
        """
        Register a function to be called if the executor enters a permanent
        failed state.
        """
        pass

    def determine_available_memory(self) -> list[int]:  # in bytes
        """Return each worker's available memory in bytes."""
        return self.collective_rpc("determine_available_memory")

    def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
        """Return each worker's KV cache spec mapping."""
        return self.collective_rpc("get_kv_cache_spec")

    @overload
    def collective_rpc(
        self,
        method: str | Callable[[WorkerBase], _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict | None = None,
        non_block: Literal[False] = False,
    ) -> list[_R]:
        """
        Execute an RPC call on all workers.
        Args:
            method: Name of the worker method to execute, or a callable that
                is serialized and sent to all workers to execute.
                If the method is a callable, it should accept an additional
                `self` argument, in addition to the arguments passed in `args`
                and `kwargs`. The `self` argument will be the worker object.
            timeout: Maximum time in seconds to wait for execution. Raises a
                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
            args: Positional arguments to pass to the worker method.
            kwargs: Keyword arguments to pass to the worker method.
            non_block: If `True`, returns a list of Futures instead of waiting
                for the results.
        Returns:
            A list containing the results from each worker.
        Note:
            It is recommended to use this API to only pass control messages,
            and set up data-plane communication to pass data.
        """
        pass

    @overload
    def collective_rpc(
        self,
        method: str | Callable[[WorkerBase], _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict | None = None,
        non_block: Literal[True] = True,
    ) -> Future[list[_R]]:
        pass

    @abstractmethod
    def collective_rpc(
        self, method, timeout=None, args=(), kwargs=None, non_block: bool = False
    ):
        # Implementation provided by concrete executors; see overloads above
        # for the documented contract.
        raise NotImplementedError

    def get_kv_connector_handshake_metadata(
        self,
    ) -> list[dict[int, KVConnectorHandshakeMetadata]]:
        """Return KV-connector handshake metadata from each worker."""
        return self.collective_rpc("get_kv_connector_handshake_metadata")

    @overload
    def execute_model(
        self, scheduler_output: SchedulerOutput, non_block: Literal[False] = False
    ) -> ModelRunnerOutput | None:
        pass

    @overload
    def execute_model(
        self, scheduler_output: SchedulerOutput, non_block: Literal[True] = True
    ) -> Future[ModelRunnerOutput | None]:
        pass

    def execute_model(
        self, scheduler_output: SchedulerOutput, non_block: bool = False
    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
        """Run the model on all workers; return the first worker's output."""
        output = self.collective_rpc(  # type: ignore[call-overload]
            "execute_model", args=(scheduler_output,), non_block=non_block
        )
        return output[0]

    @overload
    def sample_tokens(
        self, grammar_output: GrammarOutput | None, non_block: Literal[False] = False
    ) -> ModelRunnerOutput:
        pass

    @overload
    def sample_tokens(
        self, grammar_output: GrammarOutput | None, non_block: Literal[True] = True
    ) -> Future[ModelRunnerOutput]:
        pass

    def sample_tokens(
        self, grammar_output: GrammarOutput | None, non_block: bool = False
    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
        """Sample tokens on all workers; return the first worker's output."""
        output = self.collective_rpc(  # type: ignore[call-overload]
            "sample_tokens", args=(grammar_output,), non_block=non_block
        )
        return output[0]

    def execute_dummy_batch(self) -> None:
        """Execute a dummy batch on all workers (e.g. to keep ranks in step)."""
        self.collective_rpc("execute_dummy_batch")

    def take_draft_token_ids(self) -> DraftTokenIds | None:
        """Return the first worker's draft token ids (speculative decoding)."""
        output: list[DraftTokenIds] = self.collective_rpc("take_draft_token_ids")
        return output[0]

    @property
    def max_concurrent_batches(self) -> int:
        # Default: one in-flight batch; distributed executors may override.
        return 1

    def profile(self, is_start: bool = True):
        """Start (or stop, if is_start=False) profiling on all workers."""
        self.collective_rpc("profile", args=(is_start,))

    def save_sharded_state(
        self,
        path: str,
        pattern: str | None = None,
        max_size: int | None = None,
    ) -> None:
        """Ask every worker to save its shard of the model state to `path`."""
        self.collective_rpc(
            "save_sharded_state",
            kwargs=dict(path=path, pattern=pattern, max_size=max_size),
        )

    @abstractmethod
    def check_health(self) -> None:
        """Checks if the executor is healthy. If not, it should raise an
        exception."""
        raise NotImplementedError

    def shutdown(self) -> None:
        """Shutdown the executor."""
        self.collective_rpc("shutdown")

    def init_kv_output_aggregator(self, connector: "KVConnectorBase") -> None:
        """Init KVOutputAggregator"""
        self.kv_output_aggregator = KVOutputAggregator.from_connector(
            connector, self.parallel_config.world_size
        )

    @cached_property  # Avoid unnecessary RPC calls
    def supported_tasks(self) -> tuple[SupportedTask, ...]:
        """Tasks supported by the model, as reported by the first worker."""
        output: list[tuple[SupportedTask, ...]]
        output = self.collective_rpc("get_supported_tasks")
        return output[0]

    def add_lora(self, lora_request: LoRARequest) -> bool:
        """Add a LoRA adapter on all workers; True only if all succeed."""
        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
        return all(self.collective_rpc("add_lora", args=(lora_request,)))

    def remove_lora(self, lora_id: int) -> bool:
        """Remove a LoRA adapter on all workers; True only if all succeed."""
        assert lora_id > 0, "lora_id must be greater than 0."
        return all(self.collective_rpc("remove_lora", args=(lora_id,)))

    def pin_lora(self, lora_id: int) -> bool:
        """Pin a LoRA adapter on all workers; True only if all succeed."""
        assert lora_id > 0, "lora_id must be greater than 0."
        return all(self.collective_rpc("pin_lora", args=(lora_id,)))

    def list_loras(self) -> set[int]:
        """Return the set of loaded LoRA ids (must agree across workers)."""
        sets: list[set[int]] = self.collective_rpc("list_loras")
        for s in sets:
            assert s == sets[0], "All workers should have the same LORAs."
        return sets[0]

    def reset_mm_cache(self) -> None:
        """Reset the multi-modal cache in each worker."""
        self.collective_rpc("reset_mm_cache")

    def sleep(self, level: int = 1):
        """Put all workers to sleep at the given level (no-op if sleeping)."""
        if self.is_sleeping:
            logger.warning("Executor is already sleeping.")
            return
        time_before_sleep = time.perf_counter()
        self.collective_rpc("sleep", kwargs=dict(level=level))
        time_after_sleep = time.perf_counter()
        # Both tag groups are asleep until explicitly woken via wake_up().
        self.sleeping_tags = {"weights", "kv_cache"}
        self.is_sleeping = True
        logger.info(
            "It took %.6f seconds to fall asleep.", time_after_sleep - time_before_sleep
        )

    def wake_up(self, tags: list[str] | None = None):
        """Wake the given sleeping tags (or all tags if `tags` is None)."""
        if not self.is_sleeping:
            logger.warning("Executor is not sleeping.")
            return
        if tags:
            # Validate before issuing the RPC: all requested tags must
            # currently be asleep, otherwise do nothing.
            for tag in tags:
                if tag not in self.sleeping_tags:
                    logger.warning(
                        "Tag %s is not in sleeping tags %s", tag, self.sleeping_tags
                    )
                    return
        time_before_wakeup = time.perf_counter()
        self.collective_rpc("wake_up", kwargs=dict(tags=tags))
        time_after_wakeup = time.perf_counter()
        logger.info(
            "It took %.6f seconds to wake up tags %s.",
            time_after_wakeup - time_before_wakeup,
            tags if tags is not None else self.sleeping_tags,
        )
        if tags:
            for tag in tags:
                self.sleeping_tags.remove(tag)
        else:
            self.sleeping_tags.clear()
        # Fully awake only when no tags remain asleep.
        if not self.sleeping_tags:
            self.is_sleeping = False

    def reinitialize_distributed(
        self, reconfig_request: ReconfigureDistributedRequest
    ) -> None:
        # Optional: only executors supporting elastic reconfiguration override.
        raise NotImplementedError
from vllm.v1.executor.uniproc_executor import (  # noqa: E402
    ExecutorWithExternalLauncher as _ExecutorWithExternalLauncher,
)
from vllm.v1.executor.uniproc_executor import (  # noqa: E402
    UniProcExecutor as _UniProcExecutor,
)

# For backwards compatibility: these classes used to live in this module,
# so re-export them here under their historical names.
UniProcExecutor = _UniProcExecutor
ExecutorWithExternalLauncher = _ExecutorWithExternalLauncher

View File

@@ -0,0 +1,890 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import multiprocessing
import os
import pickle
import queue
import signal
import threading
import time
import traceback
import weakref
from collections import deque
from collections.abc import Callable, Sequence
from concurrent.futures import Future, InvalidStateError
from contextlib import suppress
from dataclasses import dataclass
from enum import Enum, auto
from functools import cached_property, partial
from multiprocessing.connection import Connection
from multiprocessing.process import BaseProcess
from multiprocessing.synchronize import Lock as LockType
from threading import Thread
from typing import Any, cast
import cloudpickle
import torch
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.distributed import destroy_distributed_environment, destroy_model_parallel
from vllm.distributed.device_communicators.shm_broadcast import Handle, MessageQueue
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.distributed.parallel_state import (
get_dcp_group,
get_dp_group,
get_ep_group,
get_inner_dp_world_group,
get_pcp_group,
get_pp_group,
get_tp_group,
)
from vllm.envs import enable_envs_cache
from vllm.logger import init_logger
from vllm.utils.network_utils import (
get_distributed_init_method,
get_loopback_ip,
get_open_port,
)
from vllm.utils.system_utils import (
_maybe_force_spawn,
decorate_logs,
get_mp_context,
set_process_title,
)
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.executor.abstract import Executor, FailureCallback
from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
from vllm.v1.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
class FutureWrapper(Future):
    """A Future tied to a shared FIFO of in-flight RPC responses.

    `result()` drains entries from the shared queue oldest-first until this
    future itself has been resolved, so responses are consumed in the order
    their RPCs were issued.
    """

    def __init__(
        self,
        futures_queue: deque[tuple["FutureWrapper", Callable]],
        aggregate: Callable = lambda x: x,
    ):
        # Shared queue of (future, response-reader) pairs; `aggregate`
        # post-processes the raw response before it is published.
        self.futures_queue = futures_queue
        self.aggregate = aggregate
        super().__init__()

    def result(self, timeout=None):
        if timeout is not None:
            raise RuntimeError("timeout not implemented")
        # Resolve futures queued ahead of us (and eventually this one);
        # pop() consumes the oldest entry since new ones are appendleft'd.
        while not self.done():
            pending, reader = self.futures_queue.pop()
            pending.wait_for_response(reader)
        return super().result()

    def wait_for_response(self, get_response: Callable):
        """Block on `get_response` and publish its (aggregated) value."""
        try:
            value = self.aggregate(get_response())
        except Exception as exc:
            with suppress(InvalidStateError):
                self.set_exception(exc)
            return
        with suppress(InvalidStateError):
            self.set_result(value)
class MultiprocExecutor(Executor):
    """Executor that runs one worker per rank in separate OS processes.

    The (leader) executor broadcasts RPCs to workers over a shared-memory
    MessageQueue and collects responses over per-worker response queues.
    """

    supports_pp: bool = True

    def __init__(self, vllm_config: VllmConfig, monitor_workers: bool = True):
        # monitor_workers: whether to start a background liveness monitor
        # for the worker processes (see start_worker_monitor()).
        self.monitor_workers = monitor_workers
        super().__init__(vllm_config)

    def _init_executor(self) -> None:
        """Spawn the worker processes and wire up the RPC message queues.

        NOTE: the queue setup/handshake order here is load-bearing and must
        stay consistent with WorkerProc (see inline comments).
        """
        # Call self.shutdown at exit to clean up
        # and ensure workers will be terminated.
        self._finalizer = weakref.finalize(self, self.shutdown)
        self.is_failed = False
        self.shutdown_event = threading.Event()
        self.failure_callback: FailureCallback | None = None
        self.world_size = self.parallel_config.world_size
        assert self.world_size % self.parallel_config.nnodes_within_dp == 0, (
            f"global world_size ({self.parallel_config.world_size}) must be "
            f"divisible by nnodes_within_dp "
            f"({self.parallel_config.nnodes_within_dp}). "
        )
        self.local_world_size = self.parallel_config.local_world_size
        tp_size = self.parallel_config.tensor_parallel_size
        pp_size = self.parallel_config.pipeline_parallel_size
        pcp_size = self.parallel_config.prefill_context_parallel_size
        assert self.world_size == tp_size * pp_size * pcp_size, (
            f"world_size ({self.world_size}) must be equal to the "
            f"tensor_parallel_size ({tp_size}) x pipeline"
            f"_parallel_size ({pp_size}) x prefill_context"
            f"_parallel_size ({pcp_size}). "
        )
        # Set multiprocessing envs
        set_multiprocessing_worker_envs()
        # use the loopback address get_loopback_ip() for communication.
        distributed_init_method = get_distributed_init_method(
            get_loopback_ip(), get_open_port()
        )
        self.rpc_broadcast_mq: MessageQueue | None = None
        scheduler_output_handle: Handle | None = None
        # Initialize worker and set up message queues for SchedulerOutputs
        # and ModelRunnerOutputs
        if self.parallel_config.node_rank_within_dp == 0:
            # For leader node within each dp rank,
            # each dp will have its own leader multiproc executor.
            max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
            self.rpc_broadcast_mq = MessageQueue(
                self.world_size,
                self.local_world_size,
                max_chunk_bytes=max_chunk_bytes,
                connect_ip=self.parallel_config.master_addr,
            )
            scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
        # Create workers
        context = get_mp_context()
        shared_worker_lock = context.Lock()
        unready_workers: list[UnreadyWorkerProcHandle] = []
        success = False
        try:
            # Global ranks hosted by this node start at this offset.
            global_start_rank = (
                self.local_world_size * self.parallel_config.node_rank_within_dp
            )
            for local_rank in range(self.local_world_size):
                global_rank = global_start_rank + local_rank
                unready_workers.append(
                    WorkerProc.make_worker_process(
                        vllm_config=self.vllm_config,
                        local_rank=local_rank,
                        rank=global_rank,
                        distributed_init_method=distributed_init_method,
                        input_shm_handle=scheduler_output_handle,
                        shared_worker_lock=shared_worker_lock,
                    )
                )
            # Workers must be created before wait_for_ready to avoid
            # deadlock, since worker.init_device() does a device sync.
            # Wait for all local workers to be ready.
            self.workers = WorkerProc.wait_for_ready(unready_workers)
            # Start background thread to monitor worker health if not in headless mode.
            if self.monitor_workers:
                self.start_worker_monitor()
            self.response_mqs = []
            # Only leader node have remote response mqs
            if self.parallel_config.node_rank_within_dp == 0:
                for rank in range(self.world_size):
                    if rank < self.local_world_size:
                        # Local worker: read its response queue directly.
                        local_message_queue = self.workers[rank].worker_response_mq
                        assert local_message_queue is not None
                        self.response_mqs.append(local_message_queue)
                    else:
                        # Remote worker: read via the peer response queue
                        # exposed through local worker 0.
                        remote_message_queue = self.workers[0].peer_worker_response_mqs[
                            rank
                        ]
                        assert remote_message_queue is not None
                        self.response_mqs.append(remote_message_queue)
            # Ensure message queues are ready. Will deadlock if re-ordered
            # Must be kept consistent with the WorkerProc.
            # Wait for all input mqs to be ready.
            if self.rpc_broadcast_mq is not None:
                self.rpc_broadcast_mq.wait_until_ready()
            # Wait for all remote response mqs to be ready.
            for response_mq in self.response_mqs:
                response_mq.wait_until_ready()
            success = True
        finally:
            if not success:
                # Clean up the worker procs if there was a failure.
                # Close death_writers first to signal workers to exit
                for uw in unready_workers:
                    if uw.death_writer is not None:
                        uw.death_writer.close()
                self._ensure_worker_termination([uw.proc for uw in unready_workers])
        # FIFO of (future, response-reader) pairs for non-blocking RPCs.
        self.futures_queue = deque[tuple[FutureWrapper, Callable]]()
        self.output_rank = self._get_output_rank()

    def start_worker_monitor(self, inline=False) -> None:
        """Watch worker processes; on unexpected death, fail the executor.

        With inline=False (default) the watch runs in a daemon thread;
        inline=True runs it in the calling thread.
        """
        workers = self.workers
        self_ref = weakref.ref(self)

        # Monitors worker process liveness. If any die unexpectedly,
        # logs an error, shuts down the executor and invokes the failure
        # callback to inform the engine.
        def monitor_workers():
            sentinels = [h.proc.sentinel for h in workers]
            died = multiprocessing.connection.wait(sentinels)
            _self = self_ref()
            # Ignore deaths caused by our own shutdown (or if the executor
            # has already been garbage collected).
            if not _self or getattr(_self, "shutting_down", False):
                return
            _self.is_failed = True
            proc_name = next(h.proc.name for h in workers if h.proc.sentinel == died[0])
            logger.error(
                "Worker proc %s died unexpectedly, shutting down executor.", proc_name
            )
            _self.shutdown()
            callback = _self.failure_callback
            if callback is not None:
                _self.failure_callback = None
                callback()

        if not inline:
            Thread(
                target=monitor_workers, daemon=True, name="MultiprocWorkerMonitor"
            ).start()
            return
        monitor_workers()

    def register_failure_callback(self, callback: FailureCallback):
        # If the executor already failed, notify immediately.
        if self.is_failed:
            callback()
        else:
            self.failure_callback = callback

    def execute_model(  # type: ignore[override]
        self, scheduler_output: SchedulerOutput, non_block: bool = False
    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
        """Run the model; reply comes only from the designated output rank."""
        return self.collective_rpc(
            "execute_model",
            args=(scheduler_output,),
            unique_reply_rank=self.output_rank,
            non_block=non_block,
            timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
            kv_output_aggregator=self.kv_output_aggregator,
        )

    def sample_tokens(  # type: ignore[override]
        self, grammar_output: GrammarOutput | None, non_block: bool = False
    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
        """Sample tokens; reply comes only from the designated output rank."""
        return self.collective_rpc(
            "sample_tokens",
            args=(grammar_output,),
            unique_reply_rank=self.output_rank,
            non_block=non_block,
            timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
            kv_output_aggregator=self.kv_output_aggregator,
        )

    def execute_dummy_batch(self) -> None:
        self.collective_rpc("execute_dummy_batch", unique_reply_rank=self.output_rank)

    def take_draft_token_ids(self) -> DraftTokenIds | None:
        # OPTIMIZATION: Get output only from a single worker (output_rank)
        return self.collective_rpc(
            "take_draft_token_ids", unique_reply_rank=self.output_rank
        )

    def collective_rpc(  # type: ignore[override]
        self,
        method: str | Callable,
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict | None = None,
        non_block: bool = False,
        unique_reply_rank: int | None = None,
        kv_output_aggregator: KVOutputAggregator | None = None,
    ) -> Any:
        """Returns single result if unique_reply_rank and/or kv_output_aggregator
        is provided, otherwise list."""
        assert self.rpc_broadcast_mq is not None, (
            "collective_rpc should not be called on follower node"
        )
        if self.is_failed:
            raise RuntimeError("Executor failed.")
        deadline = None if timeout is None else time.monotonic() + timeout
        kwargs = kwargs or {}
        if kv_output_aggregator is not None:
            # When aggregating KV-transfer outputs we need responses from
            # all ranks, so do not restrict the reply rank.
            output_rank = None
            aggregate: Callable[[Any], Any] = partial(
                kv_output_aggregator.aggregate, output_rank=unique_reply_rank or 0
            )
        else:
            output_rank = unique_reply_rank
            aggregate = lambda x: x
        if isinstance(method, str):
            send_method = method
        else:
            # Callables are shipped to the workers via cloudpickle.
            send_method = cloudpickle.dumps(method, protocol=pickle.HIGHEST_PROTOCOL)
        self.rpc_broadcast_mq.enqueue((send_method, args, kwargs, output_rank))
        response_mqs: Sequence[MessageQueue] = self.response_mqs
        if output_rank is not None:
            # Only one rank will reply; listen to its queue alone.
            response_mqs = (response_mqs[output_rank],)
        shutdown_event = self.shutdown_event

        def get_response():
            responses = []
            for mq in response_mqs:
                dequeue_timeout = (
                    None if deadline is None else (deadline - time.monotonic())
                )
                try:
                    status, result = mq.dequeue(
                        timeout=dequeue_timeout, cancel=shutdown_event
                    )
                except TimeoutError as e:
                    raise TimeoutError(f"RPC call to {method} timed out.") from e
                if status != WorkerProc.ResponseStatus.SUCCESS:
                    raise RuntimeError(
                        f"Worker failed with error '{result}', please check the"
                        " stack trace above for the root cause"
                    )
                responses.append(result)
            return responses[0] if output_rank is not None else responses

        if non_block:
            # Defer reading the response: queue a future that will drain
            # older pending responses first (see FutureWrapper).
            future = FutureWrapper(self.futures_queue, aggregate=aggregate)
            self.futures_queue.appendleft((future, get_response))
            return future
        # First drain any pending futures in the queue.
        while self.futures_queue:
            future, get_fut_response = self.futures_queue.pop()
            future.wait_for_response(get_fut_response)
        return aggregate(get_response())

    @staticmethod
    def _ensure_worker_termination(worker_procs: list[BaseProcess]):
        """Ensure that all worker processes are terminated. Assumes workers have
        received termination requests. Waits for processing, then sends
        termination and kill signals if needed."""

        def wait_for_termination(procs, timeout):
            if not time:
                # If we are in late stage shutdown, the interpreter may replace
                # `time` with `None`.
                return all(not proc.is_alive() for proc in procs)
            start_time = time.time()
            while time.time() - start_time < timeout:
                if all(not proc.is_alive() for proc in procs):
                    return True
                time.sleep(0.1)
            return False

        # Send SIGTERM if still running
        active_procs = [proc for proc in worker_procs if proc.is_alive()]
        for p in active_procs:
            p.terminate()
        if not wait_for_termination(active_procs, 4):
            # Send SIGKILL if still running
            active_procs = [p for p in active_procs if p.is_alive()]
            for p in active_procs:
                p.kill()

    def shutdown(self):
        """Properly shut down the executor and its workers"""
        if not getattr(self, "shutting_down", False):
            self.shutting_down = True
            # Make sure all the worker processes are terminated first.
            if workers := getattr(self, "workers", None):
                for w in workers:
                    # Close death_writer to signal child processes to exit
                    if w.death_writer is not None:
                        w.death_writer.close()
                        w.death_writer = None
                    w.worker_response_mq = None
                self._ensure_worker_termination([w.proc for w in workers])
        # Unblocks any threads waiting on a response dequeue.
        self.shutdown_event.set()
        self.rpc_broadcast_mq = None

    def check_health(self) -> None:
        self.collective_rpc("check_health", timeout=10)
        return

    @cached_property
    def max_concurrent_batches(self) -> int:
        # Async scheduling allows one extra batch in flight; otherwise one
        # batch per pipeline stage.
        if self.scheduler_config.async_scheduling:
            return 2
        return self.parallel_config.pipeline_parallel_size

    def _get_output_rank(self) -> int:
        # Only returns ModelRunnerOutput from TP rank=0 and PP rank=-1
        # (the first TP worker of the last PP stage).
        # Example:
        # Assuming TP=8, PP=4, then the world_size=32
        # 0-7, PP rank 0
        # 8-15, PP rank 1
        # 16-23, PP rank 2
        # 24-31, PP rank 3
        # so world_size - tp_size = 32 - 8 = 24 should be PP rank = -1 (i.e. 3)
        return (
            self.world_size
            - self.parallel_config.tensor_parallel_size
            * self.parallel_config.prefill_context_parallel_size
        )
@dataclass
class UnreadyWorkerProcHandle:
    """WorkerProcess handle before READY."""

    proc: BaseProcess  # the spawned worker process
    rank: int  # global rank of the worker
    ready_pipe: Connection  # read end of the readiness pipe
    # Write end of the death pipe; closing it signals the child to exit.
    death_writer: Connection | None = None
@dataclass
class WorkerProcHandle:
    """Handle to a worker process that has completed its READY handshake."""

    proc: BaseProcess  # the spawned worker process
    rank: int  # global rank of the worker
    # The worker process writes to this MQ in single-node mode
    worker_response_mq: MessageQueue | None
    # This is only non empty on driver node,
    # the peer worker process i writes to MQ
    # `peer_worker_response_mqs[i]`
    peer_worker_response_mqs: list[MessageQueue | None]
    # Write end of the death pipe; closing it signals the child to exit.
    death_writer: Connection | None = None

    @classmethod
    def from_unready_handle(
        cls,
        unready_handle: UnreadyWorkerProcHandle,
        worker_response_mq: MessageQueue | None,
        peer_worker_response_mqs: list[MessageQueue | None],
    ) -> "WorkerProcHandle":
        """Promote an unready handle once its response queues are known."""
        return cls(
            proc=unready_handle.proc,
            rank=unready_handle.rank,
            worker_response_mq=worker_response_mq,
            peer_worker_response_mqs=peer_worker_response_mqs,
            death_writer=unready_handle.death_writer,
        )
class WorkerProc:
"""Wrapper that runs one Worker in a separate process."""
READY_STR = "READY"
rpc_broadcast_mq: MessageQueue | None
worker_response_mq: MessageQueue | None
    def _init_message_queues(
        self, input_shm_handle: Handle, vllm_config: VllmConfig
    ) -> None:
        """Set up the RPC input queue and the response queue for this worker.

        Single-node (nnodes_within_dp == 1) uses plain shared-memory queues;
        multi-node goes through the inner-DP world group broadcasters.
        """
        if vllm_config.parallel_config.nnodes_within_dp == 1:
            # Initialize MessageQueue for receiving SchedulerOutput
            self.rpc_broadcast_mq = MessageQueue.create_from_handle(
                input_shm_handle, self.worker.rank
            )
            # Initializes a message queue for sending the model output
            self.worker_response_mq = MessageQueue(1, 1)
            # No cross-node peers in single-node mode.
            self.peer_response_handles = []
        else:
            # Initialize remote MessageQueue for receiving SchedulerOutput across nodes
            self.rpc_broadcast_mq = get_inner_dp_world_group().create_mq_broadcaster(
                external_writer_handle=input_shm_handle,
                # Since there is external_writer_handle from executor proc,
                # where the ready signal from actual writer is sent out of the
                # create_mq_broadcaster method and after this setup, we make it
                # non blocking. The handshake will be triggered when
                # worker.rpc_broadcast_mq.wait_until_ready() is called
                blocking=False,
            )
            # Initializes remote message queue for sending the model output to the
            # driver worker, exposing peer_response_handles for driver worker
            # that include handles for all ranks
            self.worker_response_mq, self.peer_response_handles = (
                get_inner_dp_world_group().create_single_reader_mq_broadcasters(
                    reader_rank_in_group=0
                )
            )
    def __init__(
        self,
        vllm_config: VllmConfig,
        local_rank: int,
        rank: int,
        distributed_init_method: str,
        input_shm_handle: Handle,
        shared_worker_lock: LockType,
    ):
        """Initialize the wrapped worker inside the child process.

        Creates the worker wrapper, optionally starts the async-output copy
        thread, initializes the device, sets up message queues, and loads
        the model.
        """
        self.rank = rank
        wrapper = WorkerWrapperBase(
            vllm_config=vllm_config, rpc_rank=local_rank, global_rank=rank
        )
        # TODO: move `init_worker` to executor level as a collective rpc call
        all_kwargs: list[dict] = [
            {} for _ in range(vllm_config.parallel_config.world_size)
        ]
        # The first rank of each TP group is the driver worker.
        is_driver_worker = rank % vllm_config.parallel_config.tensor_parallel_size == 0
        all_kwargs[local_rank] = {
            "vllm_config": vllm_config,
            "local_rank": local_rank,
            "rank": rank,
            "distributed_init_method": distributed_init_method,
            "is_driver_worker": is_driver_worker,
            "shared_worker_lock": shared_worker_lock,
        }
        wrapper.init_worker(all_kwargs)
        self.worker = wrapper
        scheduler_config = vllm_config.scheduler_config
        self.use_async_scheduling = scheduler_config.async_scheduling
        if self.use_async_scheduling:
            # Daemon thread running async_output_busy_loop (defined elsewhere
            # in this class), fed via async_output_queue.
            self.async_output_queue: queue.Queue = queue.Queue()
            self.async_output_copy_thread = Thread(
                target=self.async_output_busy_loop,
                daemon=True,
                name="WorkerAsyncOutputCopy",
            )
            self.async_output_copy_thread.start()
        # Initialize device
        self.worker.init_device()
        # Set process title and log prefix
        self.setup_proc_title_and_log_prefix(
            enable_ep=vllm_config.parallel_config.enable_expert_parallel
        )
        # Set up message queues, then load the model.
        self._init_message_queues(input_shm_handle, vllm_config)
        self.worker.load_model()
        # Enable environment variable cache (e.g. assume no more
        # environment variable overrides after this point)
        enable_envs_cache()
    @staticmethod
    def make_worker_process(
        vllm_config: VllmConfig,
        local_rank: int,
        rank: int,
        distributed_init_method: str,
        input_shm_handle,  # Receive SchedulerOutput
        shared_worker_lock: LockType,
    ) -> UnreadyWorkerProcHandle:
        """Spawn a worker child process and return its unready handle.

        The returned handle carries the readiness pipe (for the READY
        handshake) and the death-pipe write end (closing it tells the child
        that the parent has exited).
        """
        context = get_mp_context()
        # (reader, writer)
        reader, writer = context.Pipe(duplex=False)
        # Create death pipe to detect parent process exit
        death_reader, death_writer = context.Pipe(duplex=False)
        process_kwargs = {
            "vllm_config": vllm_config,
            "local_rank": local_rank,
            "rank": rank,
            "distributed_init_method": distributed_init_method,
            "input_shm_handle": input_shm_handle,
            "ready_pipe": (reader, writer),
            "death_pipe": death_reader,
            "shared_worker_lock": shared_worker_lock,
        }
        # Run EngineCore busy loop in background process.
        proc = context.Process(
            target=WorkerProc.worker_main,
            kwargs=process_kwargs,
            name=f"VllmWorker-{rank}",
            daemon=True,
        )
        proc.start()
        # Parent no longer needs the write end of the readiness pipe.
        writer.close()
        # Keep death_writer open in parent - when parent exits,
        # death_reader in child will get EOFError
        return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
    @staticmethod
    def wait_for_response_handle_ready(
        handles: dict[str, Any], proc_handle: UnreadyWorkerProcHandle
    ) -> WorkerProcHandle:
        """Build a ready WorkerProcHandle from the handles sent by the child.

        `handles` is the READY message payload containing the worker's
        response-queue handle and the per-rank peer response handles.
        """
        response_handle = handles["handle"]
        worker_response_mq: MessageQueue | None = None
        # Attach a local reader only if the handle advertises local readers.
        if len(response_handle.local_reader_ranks) > 0:
            worker_response_mq = MessageQueue.create_from_handle(response_handle, 0)
        peer_response_handles = handles["peer_response_handles"]
        # Remote peers are reachable only when a remote subscribe address
        # exists; others map to None.
        peer_worker_response_mqs = [
            MessageQueue.create_from_handle(handle, -1)
            if handle.remote_subscribe_addr is not None
            else None
            for handle in peer_response_handles
        ]
        return WorkerProcHandle.from_unready_handle(
            proc_handle,
            worker_response_mq,
            peer_worker_response_mqs=peer_worker_response_mqs,
        )
    @staticmethod
    def wait_for_ready(
        unready_proc_handles: list[UnreadyWorkerProcHandle],
    ) -> list[WorkerProcHandle]:
        """Block until every worker reports READY; return ready handles.

        Raises a generic Exception if any worker reports a non-READY status
        or its pipe closes before the handshake completes.
        """
        e = Exception(
            "WorkerProc initialization failed due to "
            "an exception in a background process. "
            "See stack trace for root cause."
        )
        pipes = {handle.ready_pipe: handle for handle in unready_proc_handles}
        ready_proc_handles: list[WorkerProcHandle | None] = [None] * len(
            unready_proc_handles
        )
        while pipes:
            # Block until at least one worker's readiness pipe has data.
            ready = multiprocessing.connection.wait(pipes.keys())
            for pipe in ready:
                assert isinstance(pipe, Connection)
                try:
                    # Wait until the WorkerProc is ready.
                    unready_proc_handle = pipes.pop(pipe)
                    response: dict[str, Any] = pipe.recv()
                    if response["status"] != "READY":
                        raise e
                    # Map the worker's global rank to its local slot.
                    idx = unready_proc_handle.rank % len(ready_proc_handles)
                    ready_proc_handles[idx] = WorkerProc.wait_for_response_handle_ready(
                        response, unready_proc_handle
                    )
                except EOFError:
                    e.__suppress_context__ = True
                    raise e from None
                finally:
                    # Close connection.
                    pipe.close()
        return cast(list[WorkerProcHandle], ready_proc_handles)
    def shutdown(self):
        """Shut down the wrapped worker and tear down distributed state."""
        self.worker.shutdown()
        # Drop queue references so shared-memory resources can be released.
        self.rpc_broadcast_mq = None
        self.worker_response_mq = None
        destroy_model_parallel()
        destroy_distributed_environment()
    @staticmethod
    def worker_main(*args, **kwargs):
        """Worker initialization and execution loops.
        This runs a background process.

        Expects ``ready_pipe`` (a (reader, writer) Connection pair) and
        optionally ``death_pipe`` in kwargs; the remaining args/kwargs are
        forwarded to the WorkerProc constructor.
        """
        # Signal handler used for graceful termination.
        # SystemExit exception is only raised once to allow this and worker
        # processes to terminate without error
        shutdown_requested = False

        def signal_handler(signum, frame):
            nonlocal shutdown_requested
            if not shutdown_requested:
                shutdown_requested = True
                raise SystemExit()

        # Either SIGTERM or SIGINT will terminate the worker
        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

        worker = None
        # tuple[Connection, Connection]
        reader, ready_writer = kwargs.pop("ready_pipe")
        death_pipe = kwargs.pop("death_pipe", None)
        # Set by the death monitor to cancel the busy loop's blocking dequeue.
        shutdown_event = threading.Event()
        # Start death monitoring thread if death_pipe is provided
        if death_pipe is not None:

            def monitor_parent_death():
                try:
                    # This will block until parent process exits (pipe closes)
                    death_pipe.recv()
                except EOFError:
                    # Parent process has exited, terminate this worker
                    logger.info_once("Parent process exited, terminating worker")
                    # Send signal to self to trigger clean shutdown
                    shutdown_event.set()
                except Exception as e:
                    logger.warning("Death monitoring error: %s", e)

            death_monitor = Thread(
                target=monitor_parent_death, daemon=True, name="WorkerDeathMonitor"
            )
            death_monitor.start()

        try:
            # The reader end belongs to the parent; close our copy.
            reader.close()
            worker = WorkerProc(*args, **kwargs)
            assert worker.worker_response_mq is not None

            # Send READY once we know everything is loaded
            ready_writer.send(
                {
                    "status": WorkerProc.READY_STR,
                    "handle": worker.worker_response_mq.export_handle(),
                    "peer_response_handles": worker.peer_response_handles,
                }
            )

            # Ensure message queues are ready. Will deadlock if re-ordered.
            # Must be kept consistent with the Executor
            if worker.rpc_broadcast_mq is not None:
                worker.rpc_broadcast_mq.wait_until_ready()
            worker.worker_response_mq.wait_until_ready()
            # Set to None to mark successful startup; the except/finally
            # branches use this to distinguish startup vs runtime failures.
            ready_writer.close()
            ready_writer = None

            worker.worker_busy_loop(cancel=shutdown_event)

        except Exception:
            # NOTE: if an Exception arises in busy_loop, we send
            # a FAILURE message over the MQ RPC to notify the Executor,
            # which triggers system shutdown.
            # TODO(rob): handle case where the MQ itself breaks.
            if ready_writer is not None:
                logger.exception("WorkerProc failed to start.")
            elif shutdown_event.is_set():
                logger.info("WorkerProc shutting down.")
            else:
                logger.exception("WorkerProc failed.")

            # The parent sends a SIGTERM to all worker processes if
            # any worker dies. Set this value so we don't re-throw
            # SystemExit() to avoid zmq exceptions in __del__.
            shutdown_requested = True

        finally:
            if ready_writer is not None:
                ready_writer.close()
            if death_pipe is not None:
                death_pipe.close()
            # Clean up once worker exits busy loop
            if worker is not None:
                worker.shutdown()
    class ResponseStatus(Enum):
        # Outcome tag sent back to the Executor with each response: SUCCESS
        # carries the result, FAILURE carries the stringified exception.
        SUCCESS = auto()
        FAILURE = auto()
def enqueue_output(self, output: Any):
"""Prepares output from the worker and enqueues it to the
worker_response_mq. If the output is an Exception, it is
converted to a FAILURE response.
"""
if isinstance(output, AsyncModelRunnerOutput):
output = output.get_output()
if isinstance(output, Exception):
result = (WorkerProc.ResponseStatus.FAILURE, str(output))
else:
result = (WorkerProc.ResponseStatus.SUCCESS, output)
if (response_mq := self.worker_response_mq) is not None:
response_mq.enqueue(result)
def handle_output(self, output: Any):
"""Handles output from the worker. If async scheduling is enabled,
it is passed to the async_output_busy_loop thread. Otherwise, it is
enqueued directly to the worker_response_mq.
"""
if self.use_async_scheduling:
self.async_output_queue.put(output)
else:
self.enqueue_output(output)
def async_output_busy_loop(self):
"""Entrypoint for the thread which handles outputs asynchronously."""
while True:
output = self.async_output_queue.get()
self.enqueue_output(output)
    def worker_busy_loop(self, cancel: threading.Event | None = None):
        """Main busy loop for Multiprocessing Workers.

        Dequeues (method, args, kwargs, output_rank) tuples from the RPC
        broadcast queue and invokes them on the wrapped worker. The result
        (or exception) is sent back only from the rank matching output_rank,
        or from every rank when output_rank is None.
        """
        assert self.rpc_broadcast_mq is not None
        while True:
            method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
                cancel=cancel, indefinite=True
            )
            try:
                if isinstance(method, str):
                    # Plain method name: look it up on the wrapped worker.
                    func = getattr(self.worker, method)
                elif isinstance(method, bytes):
                    # Pickled callable: deserialize and bind the worker as
                    # its first argument.
                    func = partial(cloudpickle.loads(method), self.worker)
                output = func(*args, **kwargs)
            except Exception as e:
                # Notes have been introduced in python 3.11
                if hasattr(e, "add_note"):
                    e.add_note(traceback.format_exc())
                logger.exception("WorkerProc hit an exception.")
                # exception might not be serializable, so we convert it to
                # string, only for logging purpose.
                if output_rank is None or self.rank == output_rank:
                    self.handle_output(e)
                continue

            if output_rank is None or self.rank == output_rank:
                self.handle_output(output)
@staticmethod
def setup_proc_title_and_log_prefix(enable_ep: bool) -> None:
dp_size = get_dp_group().world_size
dp_rank = get_dp_group().rank_in_group
pp_size = get_pp_group().world_size
pp_rank = get_pp_group().rank_in_group
pcp_size = get_pcp_group().world_size
pcp_rank = get_pcp_group().rank_in_group
tp_size = get_tp_group().world_size
tp_rank = get_tp_group().rank_in_group
dcp_size = get_dcp_group().world_size
dcp_rank = get_dcp_group().rank_in_group
process_name = "Worker"
if dp_size > 1:
process_name += f"_DP{dp_rank}"
if pp_size > 1:
process_name += f"_PP{pp_rank}"
if pcp_size > 1:
process_name += f"_PCP{pcp_rank}"
if tp_size > 1:
process_name += f"_TP{tp_rank}"
if dcp_size > 1:
process_name += f"_DCP{dcp_rank}"
if enable_ep:
ep_rank = get_ep_group().rank_in_group
process_name += f"_EP{ep_rank}"
set_process_title(name=process_name)
decorate_logs(process_name)
def set_multiprocessing_worker_envs():
    """Set up environment variables that should be used when there are workers
    in a multiprocessing environment. This should be called by the parent
    process before worker processes are created."""
    _maybe_force_spawn()

    # Configure thread parallelism if OMP_NUM_THREADS isn't set
    #
    # Helps to avoid CPU contention. The default of spawning a thread per
    # core combined with multiprocessing for each GPU can have a negative
    # impact on performance. The contention is amplified when running in a
    # container where CPU limits can cause throttling.
    default_omp_num_threads = 1
    if "OMP_NUM_THREADS" not in os.environ:
        current_parallelism = torch.get_num_threads()
        if current_parallelism > default_omp_num_threads:
            logger.warning(
                "Reducing Torch parallelism from %d threads to %d to avoid "
                "unnecessary CPU contention. Set OMP_NUM_THREADS in the "
                "external environment to tune this value as needed.",
                current_parallelism,
                default_omp_num_threads,
            )
            os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
            torch.set_num_threads(default_omp_num_threads)

View File

@@ -0,0 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.v1.executor.ray_executor import (
    RayDistributedExecutor as _RayDistributedExecutor,
)

# Re-exported for backwards compatibility: older code imports
# RayDistributedExecutor from this module path.
RayDistributedExecutor = _RayDistributedExecutor

View File

@@ -0,0 +1,626 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections import defaultdict
from collections.abc import Callable
from concurrent.futures import Future
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
import cloudpickle
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.ray.ray_env import get_env_vars_to_copy
from vllm.utils.network_utils import (
get_distributed_init_method,
get_ip,
get_open_port,
)
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
from vllm.v1.executor.abstract import Executor
from vllm.v1.executor.ray_utils import (
FutureWrapper,
RayWorkerWrapper,
initialize_ray_cluster,
ray,
)
from vllm.v1.outputs import ModelRunnerOutput
if ray is not None:
from ray.actor import ActorHandle
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
else:
ActorHandle = None
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
logger = init_logger(__name__)
# Pre-resolved Future returned to non-blocking callers when there is no
# model output to wait for (result is always None).
COMPLETED_NONE_FUTURE: Future[ModelRunnerOutput | None] = Future()
COMPLETED_NONE_FUTURE.set_result(None)
@dataclass
class RayWorkerMetaData:
    """
    Metadata for a Ray worker.
    The order of ray worker creation can be random,
    and we need to reset the rank after creating all workers.
    """

    # Handle of the Ray actor hosting this worker.
    worker: ActorHandle
    # Rank assigned at actor-creation time (creation order).
    created_rank: int
    # Rank after locality-based sorting; -1 until assigned.
    adjusted_rank: int = -1
    # IP address of the node the actor landed on.
    ip: str = ""
class RayDistributedExecutor(Executor):
    """Ray-based distributed executor.

    Runs one Ray actor per worker rank and executes model steps through a
    Ray Compiled Graph (see _compiled_ray_dag).
    """

    # These env vars are worker-specific, therefore are NOT copied
    # from the driver to the workers
    WORKER_SPECIFIC_ENV_VARS = {
        "VLLM_HOST_IP",
        "VLLM_HOST_PORT",
        "LOCAL_RANK",
        "CUDA_VISIBLE_DEVICES",
    }

    # These non-vLLM env vars are copied from the driver to workers
    ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}

    # Ray orchestrates the workers; pipeline parallelism is supported.
    uses_ray: bool = True
    supports_pp: bool = True
    def _init_executor(self) -> None:
        """Initialize the Ray cluster, create the workers, and set up
        KV-connector related flags."""
        # Compiled DAG is built lazily on first execution (see _execute_dag).
        self.forward_dag: ray.dag.CompiledDAG | None = None
        # For TPU or XPU, avoid compiling NVIDIA's NCCL
        if current_platform.is_tpu() or current_platform.is_xpu():
            os.environ["VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE"] = "shm"

        assert self.uses_ray
        initialize_ray_cluster(self.parallel_config)
        placement_group = self.parallel_config.placement_group

        # Disable Ray usage stats collection.
        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
        if ray_usage != "1":
            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"

        # Create the parallel GPU workers.
        self._init_workers_ray(placement_group)

        # KV connector setup
        self.has_connector = self.vllm_config.kv_transfer_config is not None
        # Sampling is skipped for pooling models and for EC producers.
        self.uses_sampler = self.vllm_config.model_config.runner_type != "pooling" and (
            self.vllm_config.ec_transfer_config is None
            or not self.vllm_config.ec_transfer_config.is_ec_producer
        )
        # Scheduler output stashed between execute_model() and
        # sample_tokens(); None when no execution is pending.
        self.scheduler_output: SchedulerOutput | None = None
@property
def max_concurrent_batches(self) -> int:
"""Ray distributed executor supports pipeline parallelism,
meaning that it allows PP size batches to be executed concurrently.
"""
if self.scheduler_config.async_scheduling:
return 2
return self.parallel_config.pipeline_parallel_size
    def shutdown(self) -> None:
        """Tear down the compiled DAG (if built) and kill the Ray actors."""
        if logger:
            # Somehow logger can be None here.
            logger.info(
                "Shutting down Ray distributed executor. If you see error log "
                "from logging.cc regarding SIGTERM received, please ignore "
                "because this is the expected termination process in Ray."
            )
        # Workers are only killed when a DAG was actually built; the hasattr
        # guard covers shutdown before _init_executor completed.
        if hasattr(self, "forward_dag") and self.forward_dag is not None:
            self.forward_dag.teardown()
            import ray

            for worker in self.workers:
                ray.kill(worker)
            self.forward_dag = None
def _configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> dict[str, Any]:
# If nsight profiling is enabled, we need to set the profiling
# configuration for the ray workers as runtime env.
runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
runtime_env.update(
{
"nsight": {
"t": "cuda,cudnn,cublas",
"o": "'worker_process_%p'",
"cuda-graph-trace": "node",
}
}
)
return ray_remote_kwargs
    # Subclasses may override this to supply different env vars per worker.
    def _get_env_vars_to_be_updated(self):
        """Return the per-worker env-var dicts built in _init_workers_ray."""
        return self._env_vars_for_all_workers
    def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs):
        """Create one Ray actor per rank on the placement group, sort them by
        node locality, distribute env vars, and initialize the workers."""
        # Fractional values allow packing several workers onto one device.
        num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS

        # The driver dummy worker does not actually use any resources.
        # It holds the resource for the driver worker.
        self.driver_dummy_worker: RayWorkerWrapper | None = None
        # The remaining workers are the actual ray actors.
        self.workers: list[RayWorkerWrapper] = []

        # Used in ray compiled DAG: indexed first by PP rank,
        # and then TP rank. In other words, the inner list is
        # the TP group of workers for a PP rank.
        self.pp_tp_workers: list[list[RayWorkerWrapper]] = []

        if self.parallel_config.ray_workers_use_nsight:
            ray_remote_kwargs = self._configure_ray_workers_use_nsight(
                ray_remote_kwargs
            )

        # Create the workers.
        bundle_indices: list[int]
        if envs.VLLM_RAY_BUNDLE_INDICES:
            # Use the bundle indices specified by the user.
            bundle_indices = list(map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(",")))
            assert len(bundle_indices) == self.parallel_config.world_size, (
                "VLLM_RAY_BUNDLE_INDICES must have the same size"
                f" as the world size, but got {bundle_indices=} "
                f"and {self.parallel_config.world_size=}"
            )
            assert len(set(bundle_indices)) == len(bundle_indices), (
                "VLLM_RAY_BUNDLE_INDICES cannot have duplicate values,"
                f" but got {bundle_indices=}"
            )
        else:
            # use the first N bundles that have GPU resources.
            bundle_indices = []
            for bundle_id, bundle in enumerate(placement_group.bundle_specs):
                if bundle.get(current_platform.ray_device_key, 0):
                    bundle_indices.append(bundle_id)
            bundle_indices = bundle_indices[: self.parallel_config.world_size]

        worker_metadata: list[RayWorkerMetaData] = []
        driver_ip = get_ip()
        for rank, bundle_id in enumerate(bundle_indices):
            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group=placement_group,
                placement_group_capture_child_tasks=True,
                placement_group_bundle_index=bundle_id,
            )

            if current_platform.ray_device_key == "GPU":
                # NV+AMD GPUs, and Intel XPUs
                worker = ray.remote(
                    num_cpus=0,
                    num_gpus=num_gpus,
                    scheduling_strategy=scheduling_strategy,
                    **ray_remote_kwargs,
                )(RayWorkerWrapper).remote(  # type: ignore[attr-defined]
                    vllm_config=self.vllm_config, rpc_rank=rank
                )
            else:
                # Other accelerators are requested as custom Ray resources.
                worker = ray.remote(
                    num_cpus=0,
                    num_gpus=0,
                    resources={current_platform.ray_device_key: num_gpus},
                    scheduling_strategy=scheduling_strategy,
                    **ray_remote_kwargs,
                )(RayWorkerWrapper).remote(  # type: ignore[attr-defined]
                    vllm_config=self.vllm_config, rpc_rank=rank
                )
            worker_metadata.append(RayWorkerMetaData(worker=worker, created_rank=rank))

        worker_ips = ray.get(
            [
                each.worker.get_node_ip.remote()  # type: ignore[attr-defined]
                for each in worker_metadata
            ]
        )

        for each, ip in zip(worker_metadata, worker_ips):
            each.ip = ip

        logger.debug("workers: %s", worker_metadata)
        logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)

        # Number of workers per node IP, used by the locality sort below.
        ip_counts: dict[str, int] = {}
        for ip in worker_ips:
            ip_counts[ip] = ip_counts.get(ip, 0) + 1

        def sort_by_driver_then_worker_ip(item: RayWorkerMetaData):
            """
            Sort the workers based on 3 properties:
            1. If the worker is on the same node as the driver (vllm engine),
               it should be placed first.
            2. Then, if the worker is on a node with fewer workers, it should
               be placed first.
            3. Finally, if the work is on a node with smaller IP address, it
               should be placed first.
            """
            ip = item.ip
            return 0 if ip == driver_ip else 1, ip_counts[ip], ip

        # After sorting, the workers on the same node will be
        # close to each other, and the workers on the driver
        # node will be placed first.
        sorted_worker_metadata = sorted(
            worker_metadata, key=sort_by_driver_then_worker_ip
        )
        for i, item in enumerate(sorted_worker_metadata):
            item.adjusted_rank = i
        self.workers = [item.worker for item in sorted_worker_metadata]
        # Tell each actor its post-sort rank.
        rerank_mapping = {
            item.created_rank: item.adjusted_rank for item in sorted_worker_metadata
        }
        self.collective_rpc("adjust_rank", args=(rerank_mapping,))

        # Get the set of GPU IDs used on each node.
        worker_node_and_gpu_ids = []
        for worker in [self.driver_dummy_worker] + self.workers:
            if worker is None:
                # driver_dummy_worker can be None when using ray spmd worker.
                continue
            worker_node_and_gpu_ids.append(
                ray.get(worker.get_node_and_gpu_ids.remote())
            )  # type: ignore[attr-defined]

        node_workers = defaultdict(list)  # node id -> list of worker ranks
        node_gpus = defaultdict(list)  # node id -> list of gpu ids

        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
            node_workers[node_id].append(i)
            # `gpu_ids` can be a list of strings or integers.
            # convert them to integers for consistency.
            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
            # string sorting is not sufficient.
            # see https://github.com/vllm-project/vllm/issues/5590
            gpu_ids = [int(x) for x in gpu_ids]
            node_gpus[node_id].extend(gpu_ids)
        for node_id, gpu_ids in node_gpus.items():
            node_gpus[node_id] = sorted(gpu_ids)

        all_ips = set(worker_ips + [driver_ip])
        n_ips = len(all_ips)
        n_nodes = len(node_workers)

        # Duplicate IPs across nodes break the node->worker bookkeeping.
        if n_nodes != n_ips:
            raise RuntimeError(
                f"Every node should have a unique IP address. Got {n_nodes}"
                f" nodes with node ids {list(node_workers.keys())} and "
                f"{n_ips} unique IP addresses {all_ips}. Please check your"
                " network configuration. If you set `VLLM_HOST_IP`"
                " environment variable, make sure it is unique for"
                " each node."
            )

        # Set environment variables for the driver and workers.
        all_args_to_update_environment_variables = [
            {
                current_platform.device_control_env_var: ",".join(
                    map(str, node_gpus[node_id])
                ),
            }
            for (node_id, _) in worker_node_and_gpu_ids
        ]

        # Environment variables to copy from driver to workers
        env_vars_to_copy = get_env_vars_to_copy(
            exclude_vars=self.WORKER_SPECIFIC_ENV_VARS,
            additional_vars=set(current_platform.additional_env_vars).union(
                self.ADDITIONAL_ENV_VARS
            ),
            destination="workers",
        )

        # Copy existing env vars to each worker's args
        for args in all_args_to_update_environment_variables:
            # TODO: refactor platform-specific env vars
            for name in env_vars_to_copy:
                if name in os.environ:
                    args[name] = os.environ[name]

        self._env_vars_for_all_workers = all_args_to_update_environment_variables

        self.collective_rpc(
            "update_environment_variables", args=(self._get_env_vars_to_be_updated(),)
        )

        if len(node_gpus) == 1:
            # in single node case, we don't need to get the IP address.
            # the loopback address is sufficient
            # NOTE: a node may have several IP addresses, one for each
            # network interface. `get_ip()` might return any of them,
            # while they might not work for communication inside the node
            # if the network setup is complicated. Using the loopback address
            # solves this issue, as it always works for communication inside
            # the node.
            driver_ip = "127.0.0.1"
        distributed_init_method = get_distributed_init_method(
            driver_ip, get_open_port()
        )

        # Initialize the actual workers inside worker wrapper.
        all_kwargs = []
        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids):
            local_rank = node_workers[node_id].index(rank)
            kwargs = dict(
                vllm_config=self.vllm_config,
                local_rank=local_rank,
                rank=rank,
                distributed_init_method=distributed_init_method,
                is_driver_worker=(not self.parallel_config)
                or (rank % self.parallel_config.tensor_parallel_size == 0),
            )
            all_kwargs.append(kwargs)
        self.collective_rpc("init_worker", args=(all_kwargs,))
        self.collective_rpc("init_device")
        self.collective_rpc("load_model")

        # Group workers into TP groups per PP rank for compiled-DAG wiring.
        for pp_rank in range(self.parallel_config.pipeline_parallel_size):
            self.pp_tp_workers.append([])
            for tp_rank in range(self.parallel_config.tensor_parallel_size):
                # PP=2, TP=4
                # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
                rank = (pp_rank * self.parallel_config.tensor_parallel_size) + tp_rank
                assert len(self.pp_tp_workers[pp_rank]) == tp_rank
                assert pp_rank < len(self.pp_tp_workers)
                self.pp_tp_workers[pp_rank].append(self.workers[rank])
def reinitialize_distributed(
self, reconfig_request: ReconfigureDistributedRequest
) -> None:
self.collective_rpc("reinitialize_distributed", args=(reconfig_request,))
if (
reconfig_request.new_data_parallel_rank
== ReconfigureRankType.SHUTDOWN_CURRENT_RANK
):
self.shutdown()
    def execute_model(  # type: ignore[override]
        self,
        scheduler_output: SchedulerOutput,
        non_block: bool = False,
    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
        """Begin executing one scheduled batch.

        When the model will run and sampling applies, the scheduler output is
        stashed and actual execution is deferred to the following
        sample_tokens() call; the return value is then None (or a completed
        None future). Otherwise the compiled DAG is dispatched immediately.
        """
        # Guard against two execute_model() calls without an intervening
        # sample_tokens().
        if self.scheduler_output is not None:
            raise RuntimeError(
                "State error: sample_tokens() must be called "
                "after execute_model() returns None."
            )

        if not self.uses_sampler or not scheduler_output.total_num_scheduled_tokens:
            # Model will not execute, call model runner immediately.
            return self._execute_dag(scheduler_output, None, non_block)

        # Model will execute, defer to sample_tokens() call.
        self.scheduler_output = scheduler_output
        return COMPLETED_NONE_FUTURE if non_block else None
    def sample_tokens(  # type: ignore[override]
        self,
        grammar_output: "GrammarOutput | None",
        non_block: bool = False,
    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
        """Execute the model on the Ray workers.

        The scheduler output to use should have been provided in
        a prior call to execute_model().

        Args:
            grammar_output: The structured outputs grammar bitmask, if applicable.
            non_block: If True, the method will return a Future.

        Returns:
            The model runner output.
        """
        scheduler_output = self.scheduler_output
        if scheduler_output is None:
            # Nothing was deferred by execute_model(); no-op.
            return COMPLETED_NONE_FUTURE if non_block else None
        # Clear the stashed output before dispatching so the next
        # execute_model() call sees a clean state.
        self.scheduler_output = None
        return self._execute_dag(scheduler_output, grammar_output, non_block)
    def _execute_dag(
        self,
        scheduler_output: SchedulerOutput,
        grammar_output: "GrammarOutput | None",
        non_block: bool = False,
    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
        """Run the compiled Ray DAG for one batch and collect its output(s)."""
        # Build the compiled DAG for the first time.
        if self.forward_dag is None:  # type: ignore
            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)

        refs = self.forward_dag.execute((scheduler_output, grammar_output))  # type: ignore

        if not self.has_connector:
            # Get output only from a single worker (output_rank)
            # When PP is not used, we block here until the result is available.
            if not non_block:
                return refs[0].get()

            # When PP is used, we return a FutureWrapper immediately so that
            # the scheduler can yield to the next batch.
            return FutureWrapper(refs[0])

        # Get output from all workers when connector is present
        assert self.kv_output_aggregator is not None
        if not non_block:
            # Block and get results from all workers
            return self.kv_output_aggregator.aggregate(ray.get(refs))

        # Return a future that will aggregate outputs from all workers
        return FutureWrapper(refs, self.kv_output_aggregator)
def collective_rpc( # type: ignore[override]
self,
method: str | Callable,
timeout: float | None = None,
args: tuple = (),
kwargs: dict[str, Any] | None = None,
non_block: bool = False,
) -> list[Any] | Future[list[Any]]:
"""Runs the given method on all workers."""
sent_method = method if isinstance(method, str) else cloudpickle.dumps(method)
del method
if kwargs is None:
kwargs = {}
ray_worker_outputs = [
worker.execute_method.remote( # type: ignore[attr-defined]
sent_method, *args, **kwargs
)
for worker in self.workers
]
# Get the results of the ray workers.
if non_block:
return FutureWrapper(ray_worker_outputs)
return ray.get(ray_worker_outputs, timeout=timeout)
def _check_ray_cgraph_installation(self):
import importlib.metadata
from packaging import version
required_version = version.parse("2.43.0")
current_version = version.parse(importlib.metadata.version("ray"))
if current_version < required_version:
raise ValueError(
f"Ray version {required_version} is "
f"required, but found {current_version}"
)
import importlib.util
cgraph_spec = importlib.util.find_spec("ray.experimental.compiled_dag_ref")
if cgraph_spec is None:
raise ValueError(
"Ray Compiled Graph is not installed. "
"Run `pip install ray[cgraph]` to install it."
)
cupy_spec = importlib.util.find_spec("cupy")
if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE == "nccl":
raise ValueError(
"cupy is not installed but required since "
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE is set to 'nccl'. "
"Run `pip install ray[cgraph]` and check cupy installation."
)
    def _compiled_ray_dag(self, enable_asyncio: bool):
        """Build and compile the Ray Compiled Graph chaining the PP stages,
        with each TP group executing in SPMD fashion."""
        assert self.parallel_config.use_ray
        self._check_ray_cgraph_installation()
        # Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds
        # (it is 10 seconds by default). This is a Ray environment variable to
        # control the timeout of getting result from a compiled graph execution,
        # i.e., the distributed execution that includes model forward runs and
        # intermediate tensor communications, in the case of vllm.
        # Note: we should set this env var before importing
        # ray.dag, otherwise it will not take effect.
        os.environ.setdefault("RAY_CGRAPH_get_timeout", "300")  # noqa: SIM112
        from ray.dag import InputNode, MultiOutputNode

        logger.info(
            "RAY_CGRAPH_get_timeout is set to %s",
            os.environ["RAY_CGRAPH_get_timeout"],  # noqa: SIM112
        )
        logger.info(
            "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE = %s",
            envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE,
        )
        logger.info(
            "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s",
            envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM,
        )

        channel_type = envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
        if channel_type not in ("auto", "nccl", "shm"):
            raise ValueError(
                "Invalid value for VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: "
                f"{channel_type}. Valid values are: 'auto', 'nccl', or 'shm'."
            )

        with InputNode() as input_data:
            # Example DAG: PP=2, TP=4
            #
            # SchedulerOutput -> 0 -> (SchedulerOutput, IntermediateTensors) -> 4 -> ModelRunnerOutput # noqa: E501
            # SchedulerOutput -> 1 -> (SchedulerOutput, IntermediateTensors) -> 5 -> ModelRunnerOutput # noqa: E501
            # SchedulerOutput -> 2 -> (SchedulerOutput, IntermediateTensors) -> 6 -> ModelRunnerOutput # noqa: E501
            # SchedulerOutput -> 3 -> (SchedulerOutput, IntermediateTensors) -> 7 -> ModelRunnerOutput # noqa: E501

            # All workers in the first TP group will take in the
            # ExecuteModelRequest as input.
            outputs = [input_data for _ in self.pp_tp_workers[0]]
            for pp_rank, tp_group in enumerate(self.pp_tp_workers):
                # Each PP worker takes in the output of the previous PP worker,
                # and the TP group executes in SPMD fashion.
                outputs = [
                    worker.execute_model_ray.bind(outputs[i])  # type: ignore[attr-defined]
                    for i, worker in enumerate(tp_group)
                ]

                last_pp_rank = len(self.pp_tp_workers) - 1
                if (
                    pp_rank < last_pp_rank
                    and envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE != "shm"
                ):
                    # Specify how intermediate tensors should be passed
                    # between pp stages, no need to specify for the last
                    # pp stage or when using shared memory (the default).
                    transport = envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
                    outputs = [
                        output.with_tensor_transport(transport=transport)
                        for output in outputs
                    ]

            forward_dag = MultiOutputNode(outputs)

        if envs.VLLM_USE_RAY_WRAPPED_PP_COMM:
            from ray.experimental.channel.accelerator_context import (
                register_accelerator_context,
            )

            from vllm.distributed.device_communicators.ray_communicator import (
                RayPPCommunicator,
            )

            register_accelerator_context(
                torch_module_name="cuda", communicator_cls=RayPPCommunicator
            )
            logger.info(
                "Using RayPPCommunicator "
                "(which wraps vLLM _PP GroupCoordinator) "
                "for Ray Compiled Graph communication."
            )
        else:
            logger.info(
                "Using Ray's NCCL communicator for Ray Compiled Graph communication."
            )

        return forward_dag.experimental_compile(
            enable_asyncio=enable_asyncio,
            _overlap_gpu_communication=envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM,
        )
    def __del__(self):
        # Best-effort teardown when the executor is garbage collected.
        self.shutdown()
    def check_health(self) -> None:
        """Health-check stub: currently a no-op."""
        # Assume that the Ray workers are healthy.
        # TODO: check the health of the Ray workers
        return

View File

@@ -0,0 +1,465 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import time
from collections import defaultdict
from concurrent.futures import Future
from typing import TYPE_CHECKING, Union
import vllm.platforms
from vllm.config import ParallelConfig
from vllm.distributed import get_pp_group
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.utils.network_utils import get_ip
from vllm.v1.outputs import AsyncModelRunnerOutput
from vllm.v1.worker.worker_base import WorkerWrapperBase
if TYPE_CHECKING:
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.outputs import ModelRunnerOutput
logger = init_logger(__name__)
PG_WAIT_TIMEOUT = 1800
try:
import ray
from ray.util import placement_group_table
from ray.util.placement_group import PlacementGroup
try:
from ray._private.state import available_resources_per_node
except ImportError:
# Ray 2.9.x doesn't expose `available_resources_per_node`
from ray._private.state import state as _state
available_resources_per_node = _state._available_resources_per_node
class RayWorkerWrapper(WorkerWrapperBase):
"""Ray wrapper for vllm.worker.Worker, allowing Worker to be
lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
# Since the compiled DAG runs a main execution
# in a different thread that calls cuda.set_device.
# The flag indicates is set_device is called on
# that thread.
self.compiled_dag_cuda_device_set = False
def get_node_ip(self) -> str:
return get_ip()
def get_node_and_gpu_ids(self) -> tuple[str, list[int]]:
node_id = ray.get_runtime_context().get_node_id()
device_key = vllm.platforms.current_platform.ray_device_key
if not device_key:
raise RuntimeError(
"current platform %s does not support ray.",
vllm.platforms.current_platform.device_name,
)
gpu_ids = ray.get_runtime_context().get_accelerator_ids()[device_key]
return node_id, gpu_ids
def setup_device_if_necessary(self):
# TODO(swang): This is needed right now because Ray CG executes
# on a background thread, so we need to reset torch's current
# device.
# We can remove this API after it is fixed in compiled graph.
assert self.worker is not None, "Worker is not initialized"
if not self.compiled_dag_cuda_device_set:
if current_platform.is_tpu():
# Not needed
pass
else:
assert self.worker.device is not None
current_platform.set_device(self.worker.device)
self.compiled_dag_cuda_device_set = True
def execute_model_ray(
self,
execute_model_input: tuple["SchedulerOutput", "GrammarOutput"]
| tuple["SchedulerOutput", "GrammarOutput", "IntermediateTensors"],
) -> Union[
"ModelRunnerOutput",
tuple["SchedulerOutput", "GrammarOutput", "IntermediateTensors"],
]:
# This method is used by Ray Compiled Graph to execute the model,
# and it needs a special logic of self.setup_device_if_necessary()
self.setup_device_if_necessary()
assert self.worker is not None, "Worker is not initialized"
if len(execute_model_input) == 3:
scheduler_output, grammar_output, intermediate_tensors = (
execute_model_input
)
else:
scheduler_output, grammar_output = execute_model_input
intermediate_tensors = None
assert self.worker.model_runner is not None
output = self.worker.model_runner.execute_model(
scheduler_output, intermediate_tensors
)
if isinstance(output, IntermediateTensors):
output = scheduler_output, grammar_output, output
elif not get_pp_group().is_last_rank:
# Case where there are no scheduled requests
# but may still be finished requests.
assert not output or not output.req_ids
output = scheduler_output, grammar_output, None
elif output is None:
output = self.worker.model_runner.sample_tokens(grammar_output)
# Ensure outputs crossing Ray compiled DAG are serializable.
# AsyncModelRunnerOutput holds CUDA events and cannot be
# pickled.
if isinstance(output, AsyncModelRunnerOutput):
output = output.get_output()
return output
def override_env_vars(self, vars: dict[str, str]):
os.environ.update(vars)
ray_import_err = None
except ImportError as e:
ray = None # type: ignore
# only capture string to avoid variable references in the traceback that can
# prevent garbage collection in some cases
ray_import_err = str(e)
RayWorkerWrapper = None # type: ignore
class FutureWrapper(Future):
    """Future over one Ray object ref (or a list of refs).

    Satisfies the .execute_model() interface: the core busy loop calls
    .result() to block until the output is available. When an aggregator
    is supplied, the fetched per-worker outputs are combined on result();
    otherwise the raw fetched value(s) are returned as-is.
    """

    def __init__(self, ref_or_refs, aggregator: KVOutputAggregator | None = None):
        super().__init__()
        self.ref_or_refs = ref_or_refs
        self.aggregator = aggregator

    def result(self, timeout=None):
        fetched = ray.get(self.ref_or_refs, timeout=timeout)
        if self.aggregator is not None:
            return self.aggregator.aggregate(fetched, output_rank=0)
        return fetched
def ray_is_available() -> bool:
    """Report whether the optional Ray dependency imported successfully."""
    # `ray` is set to None by the module-level ImportError fallback.
    return ray is not None
def assert_ray_available():
    """Raise a ValueError if the optional Ray dependency is missing."""
    if ray is None:
        # Bug fix: add the missing space between the two sentences — the
        # implicit string concatenation previously produced
        # "...{err}.Please install...".
        raise ValueError(
            f"Failed to import Ray: {ray_import_err}. "
            "Please install Ray with `pip install ray`."
        )
def _verify_bundles(
    placement_group: "PlacementGroup", parallel_config: ParallelConfig, device_str: str
):
    """Verify a given placement group has bundles located in the right place.

    There are 2 rules.
    - Warn if all tensor parallel workers cannot fit in a single node.
    - Fail if driver node is not included in a placement group.

    Raises:
        RuntimeError: if the driver node has no bundle in the placement group.
    """
    assert ray.is_initialized(), (
        "Ray is not initialized although distributed-executor-backend is ray."
    )
    pg_data = placement_group_table(placement_group)
    # bundle_idx -> node_id
    bundle_to_node_ids = pg_data["bundles_to_node_id"]
    # bundle_idx -> bundle (e.g., {"GPU": 1})
    bundles = pg_data["bundles"]
    # node_id -> List of bundle (e.g., {"GPU": 1})
    node_id_to_bundle: dict[str, list[dict[str, float]]] = defaultdict(list)

    for bundle_idx, node_id in bundle_to_node_ids.items():
        node_id_to_bundle[node_id].append(bundles[bundle_idx])
    driver_node_id = ray.get_runtime_context().get_node_id()

    if driver_node_id not in node_id_to_bundle:
        raise RuntimeError(
            f"driver node id {driver_node_id} is not included in a placement "
            f"group {placement_group.id}. Node id -> bundles "
            f"{node_id_to_bundle}. "
            "You don't have enough GPUs available in a current node. Check "
            "`ray status` and `ray list nodes` to see if you have available "
            # BUG FIX: this segment previously lacked the f prefix, so the
            # literal text "{driver_node_id}" appeared in the error message.
            f"GPUs in a node `{driver_node_id}` before starting an vLLM engine."
        )

    # Renamed from `bundles` to avoid shadowing the bundle list built above.
    for node_id, node_bundles in node_id_to_bundle.items():
        if len(node_bundles) < parallel_config.tensor_parallel_size:
            logger.warning(
                "tensor_parallel_size=%d "
                "is bigger than a reserved number of %ss (%d "
                "%ss) in a node %s. Tensor parallel workers can be "
                "spread out to 2+ nodes which can degrade the performance "
                "unless you have fast interconnect across nodes, like "
                "Infiniband. To resolve this issue, make sure you have more "
                "than %d GPUs available at each node.",
                parallel_config.tensor_parallel_size,
                device_str,
                len(node_bundles),
                device_str,
                node_id,
                parallel_config.tensor_parallel_size,
            )
def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
    """Wait until a placement group is ready.

    It prints the informative log messages if the placement group is
    not created within time.

    Raises:
        ValueError: if the placement group cannot be provisioned within
            ``PG_WAIT_TIMEOUT`` seconds.
    """
    # Wait until PG is ready - this will block until all
    # requested resources are available, and will time out
    # if they cannot be provisioned.
    placement_group_specs = current_placement_group.bundle_specs

    s = time.time()
    pg_ready_ref = current_placement_group.ready()
    wait_interval = 10
    while time.time() - s < PG_WAIT_TIMEOUT:
        # ray.wait blocks for up to `wait_interval` seconds, so no explicit
        # sleep is needed in this loop.
        ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval)
        if len(ready) > 0:
            break

        # Exponential backoff for warning print.
        wait_interval *= 2
        logger.info(
            "Waiting for creating a placement group of specs for "
            "%d seconds. specs=%s. Check `ray status` and "
            "`ray list nodes` to see if you have enough resources,"
            " and make sure the IP addresses used by ray cluster"
            " are the same as VLLM_HOST_IP environment variable"
            " specified in each node if you are running on a multi-node.",
            int(time.time() - s),
            placement_group_specs,
        )

    try:
        # timeout=0: raise immediately if the PG is still not ready after the
        # polling loop above gave up; succeeds instantly when it is ready.
        ray.get(pg_ready_ref, timeout=0)
    except ray.exceptions.GetTimeoutError:
        # Provide more helpful error message when GPU count is exceeded
        total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
        # If more than one GPU is required for the placement group, provide a
        # more specific error message.
        # We use >1 here because multi-GPU (tensor parallel) jobs are more
        # likely to fail due to insufficient cluster resources, and users may
        # need to adjust tensor_parallel_size to fit available GPUs.
        if total_gpu_required > 1:
            raise ValueError(
                f"Cannot provide a placement group requiring "
                f"{total_gpu_required} GPUs "
                f"(placement_group_specs={placement_group_specs}) within "
                f"{PG_WAIT_TIMEOUT} seconds.\n"
                f"Tensor parallel size may exceed available GPUs in your "
                f"cluster. Check resources with `ray status` and "
                f"`ray list nodes`.\n"
                f"If running on K8s with limited GPUs, consider reducing "
                f"--tensor-parallel-size to match available GPU resources."
            ) from None
        else:
            raise ValueError(
                "Cannot provide a placement group of "
                f"{placement_group_specs=} within "
                f"{PG_WAIT_TIMEOUT} seconds. See "
                "`ray status` and `ray list nodes` to make sure the cluster "
                "has enough resources."
            ) from None
def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
    """Request removal of the placement group and poll until it is gone."""
    ray.util.remove_placement_group(current_placement_group)
    start = time.time()
    wait_interval = 10
    while True:
        elapsed = time.time() - start
        if elapsed >= PG_WAIT_TIMEOUT:
            break
        if ray.util.get_current_placement_group() is None:
            break
        # Exponential backoff for warning print.
        wait_interval *= 2
        logger.info(
            "Waiting for removing a placement group of specs for %d seconds.",
            int(elapsed),
        )
        time.sleep(wait_interval)
def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: str | None = None,
):
    """Initialize the distributed cluster with Ray.

    It will connect to the Ray cluster and create a placement group
    for the workers, which includes the specification of the resources
    for each distributed worker. On success the placement group is stored
    back into ``parallel_config.placement_group``.

    Args:
        parallel_config: The configurations for parallel execution.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.

    Raises:
        ValueError: if the platform does not support Ray, or an existing
            placement group does not provide enough devices, or the current
            node has no device available.
    """
    assert_ray_available()
    from vllm.platforms import current_platform

    # Prevalidate GPU requirements before Ray processing
    if current_platform.is_cuda() and parallel_config.world_size > 1:
        from vllm.utils.torch_utils import cuda_device_count_stateless

        available_gpus = cuda_device_count_stateless()
        if parallel_config.world_size > available_gpus:
            # Warn only (do not fail): the local GPU count may undercount
            # what a multi-node Ray cluster can provide.
            logger.warning(
                "Tensor parallel size (%d) exceeds available GPUs (%d). "
                "This may result in Ray placement group allocation failures. "
                "Consider reducing tensor_parallel_size to %d or less, "
                "or ensure your Ray cluster has %d GPUs available.",
                parallel_config.world_size,
                available_gpus,
                available_gpus,
                parallel_config.world_size,
            )

    if ray.is_initialized():
        logger.info("Ray is already initialized. Skipping Ray initialization.")
    elif current_platform.is_rocm() or current_platform.is_xpu():
        # Try to connect existing ray instance and create a new one if not found
        try:
            ray.init("auto")
        except ConnectionError:
            logger.warning(
                "No existing RAY instance detected. "
                "A new instance will be launched with current node resources."
            )
            ray.init(
                address=ray_address,
                num_gpus=parallel_config.world_size,
                runtime_env=parallel_config.ray_runtime_env,
            )
    else:
        ray.init(address=ray_address, runtime_env=parallel_config.ray_runtime_env)

    device_str = current_platform.ray_device_key
    if not device_str:
        raise ValueError(
            f"current platform {current_platform.device_name} does not support ray."
        )

    # Create or get the placement group for worker processes
    if parallel_config.placement_group:
        # A user-supplied placement group takes precedence.
        current_placement_group = parallel_config.placement_group
    else:
        current_placement_group = ray.util.get_current_placement_group()

    if current_placement_group:
        logger.info("Using the existing placement group")

        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
        device_bundles = 0
        for bundle in bundles:
            bundle_devices = bundle.get(device_str, 0)
            if bundle_devices > 1:
                raise ValueError(
                    f"Placement group bundle cannot have more than 1 {device_str}."
                )
            if bundle_devices:
                device_bundles += 1
        if parallel_config.world_size > device_bundles:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the placement group. "
                f"Required number of devices: {parallel_config.world_size}. "
                f"Total number of devices: {device_bundles}."
            )
    else:
        logger.info("No current placement group found. Creating a new placement group.")
        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
        # Log a warning message and delay resource allocation failure response.
        # Avoid immediate rejection to allow user-initiated placement group
        # created and wait cluster to be ready
        if parallel_config.world_size > num_devices_in_cluster:
            logger.warning(
                "The number of required %ss exceeds the total "
                "number of available %ss in the placement group.",
                device_str,
                device_str,
            )
        # Create a new placement group
        placement_group_specs: list[dict[str, float]] = [
            {device_str: 1.0} for _ in range(parallel_config.world_size)
        ]

        # vLLM engine is also a worker to execute model with an accelerator,
        # so it requires to have the device in a current node. Check if
        # the current node has at least one device.
        current_ip = get_ip()
        current_node_id = ray.get_runtime_context().get_node_id()
        current_node_resource = available_resources_per_node()[current_node_id]
        if current_node_resource.get(device_str, 0) < 1:
            raise ValueError(
                f"Current node has no {device_str} available. "
                f"{current_node_resource=}. vLLM engine cannot start without "
                f"{device_str}. Make sure you have at least 1 {device_str} "
                f"available in a node {current_node_id=} {current_ip=}."
            )
        # This way, at least bundle is required to be created in a current
        # node.
        placement_group_specs[0][f"node:{current_ip}"] = 0.001

        # By default, Ray packs resources as much as possible.
        current_placement_group = ray.util.placement_group(
            placement_group_specs, strategy="PACK"
        )
        _wait_until_pg_ready(current_placement_group)

    assert current_placement_group is not None
    _verify_bundles(current_placement_group, parallel_config, device_str)
    # Set the placement group in the parallel config
    parallel_config.placement_group = current_placement_group
def get_num_tpu_nodes() -> int:
    """Number of TPU nodes in the Ray cluster, assuming homogeneous nodes."""
    from ray._private.accelerators import TPUAcceleratorManager

    tpus_per_node = TPUAcceleratorManager.get_current_node_num_accelerators()
    total_tpus = int(ray.cluster_resources()["TPU"])
    # Homogeneity assumption: the cluster total must divide evenly.
    num_nodes, remainder = divmod(total_tpus, tpus_per_node)
    assert remainder == 0
    return num_nodes
def get_num_nodes_in_placement_group() -> int:
    """Count distinct nodes hosting bundles of the current placement group.

    Returns 0 when the caller is not running inside a placement group.
    """
    current_pg = ray.util.get_current_placement_group()
    if not current_pg:
        return 0
    target_key = current_pg.id.hex()
    pg_table = ray.util.placement_group_table()
    nodes_in_pg = {
        node
        for pg_key, pg in pg_table.items()
        if pg_key == target_key
        for node in pg["bundles_to_node_id"].values()
    }
    return len(nodes_in_pg)

View File

@@ -0,0 +1,186 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from collections.abc import Callable
from concurrent.futures import Future, ThreadPoolExecutor
from functools import cached_property
from multiprocessing import Lock
from typing import Any
import torch
import torch.distributed as dist
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
from vllm.v1.executor.abstract import Executor
from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
from vllm.v1.serial_utils import run_method
from vllm.v1.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
class UniProcExecutor(Executor):
    """Executor that drives a single worker living in the current process.

    No multiprocessing or Ray is involved: every "RPC" is a direct method
    call on the wrapped driver worker.
    """

    def _init_executor(self) -> None:
        """Initialize the worker and load the model."""
        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, rpc_rank=0)
        distributed_init_method, rank, local_rank = self._distributed_args()
        kwargs = dict(
            vllm_config=self.vllm_config,
            local_rank=local_rank,
            rank=rank,
            distributed_init_method=distributed_init_method,
            is_driver_worker=True,
            shared_worker_lock=Lock(),
        )
        # Single helper thread used to resolve AsyncModelRunnerOutput results
        # off the caller's thread when more than one batch can be in flight.
        self.async_output_thread: ThreadPoolExecutor | None = None
        if self.max_concurrent_batches > 1:
            self.async_output_thread = ThreadPoolExecutor(
                max_workers=1, thread_name_prefix="WorkerAsyncOutput"
            )
        self.driver_worker.init_worker(all_kwargs=[kwargs])
        self.driver_worker.init_device()
        self.driver_worker.load_model()

    def _distributed_args(self) -> tuple[str, int, int]:
        """Return (distributed_init_method, rank, local_rank)."""
        distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())
        # set local rank as the device index if specified
        device_info = self.vllm_config.device_config.device.__str__().split(":")
        local_rank = int(device_info[1]) if len(device_info) > 1 else 0
        return distributed_init_method, 0, local_rank

    @cached_property
    def max_concurrent_batches(self) -> int:
        # Async scheduling permits one extra batch in flight; otherwise 1.
        return 2 if self.scheduler_config.async_scheduling else 1

    def collective_rpc(  # type: ignore[override]
        self,
        method: str | Callable,
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict | None = None,
        non_block: bool = False,
        single_value: bool = False,
    ) -> Any:
        """Invoke ``method`` on the single driver worker.

        Args:
            method: Method name or callable executed via ``run_method``.
            timeout: Unused here; kept for interface compatibility.
            args: Positional arguments for the method.
            kwargs: Keyword arguments for the method.
            non_block: If True, return a Future (possibly already resolved)
                instead of the value, and capture exceptions in the Future.
            single_value: If True, return the bare result instead of the
                one-element list (there is only one worker).
        """
        if kwargs is None:
            kwargs = {}
        if not non_block:
            result = run_method(self.driver_worker, method, args, kwargs)
            return result if single_value else [result]

        try:
            result = run_method(self.driver_worker, method, args, kwargs)
            if isinstance(result, AsyncModelRunnerOutput):
                # Resolve the async output on the helper thread when it
                # exists; otherwise resolve it synchronously below.
                if (async_thread := self.async_output_thread) is not None:
                    if single_value:
                        return async_thread.submit(result.get_output)

                    def get_output_list() -> list[Any]:
                        # Wrap in a list to mirror the multi-worker interface.
                        return [result.get_output()]

                    return async_thread.submit(get_output_list)
                result = result.get_output()

            # Pre-completed Future keeps the non_block contract uniform even
            # though the work already finished synchronously.
            future = Future[Any]()
            future.set_result(result if single_value else [result])
        except Exception as e:
            future = Future[Any]()
            future.set_exception(e)
        return future

    def execute_model(  # type: ignore[override]
        self, scheduler_output: SchedulerOutput, non_block: bool = False
    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
        """Run the model for the scheduled batch (optionally non-blocking)."""
        return self.collective_rpc(
            "execute_model",
            args=(scheduler_output,),
            non_block=non_block,
            single_value=True,
        )

    def sample_tokens(  # type: ignore[override]
        self, grammar_output: GrammarOutput | None, non_block: bool = False
    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
        """Sample tokens for the in-flight batch (optionally non-blocking)."""
        return self.collective_rpc(
            "sample_tokens",
            args=(grammar_output,),
            non_block=non_block,
            single_value=True,
        )

    def take_draft_token_ids(self) -> DraftTokenIds | None:
        """Fetch the worker's speculative draft token ids, if any."""
        return self.collective_rpc("take_draft_token_ids", single_value=True)

    def check_health(self) -> None:
        # UniProcExecutor will always be healthy as long as
        # it's running.
        return

    def reinitialize_distributed(
        self, reconfig_request: ReconfigureDistributedRequest
    ) -> None:
        """Reconfigure distributed state; shut down if this rank is removed."""
        self.driver_worker.reinitialize_distributed(reconfig_request)
        if (
            reconfig_request.new_data_parallel_rank
            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
        ):
            self.shutdown()

    def shutdown(self) -> None:
        # Walrus guards against a falsy driver_worker (not yet initialized).
        if worker := self.driver_worker:
            worker.shutdown()
class ExecutorWithExternalLauncher(UniProcExecutor):
"""An executor that uses external launchers to launch engines,
specially designed for torchrun-compatible launchers, for
offline inference with tensor parallelism.
see https://github.com/vllm-project/vllm/issues/11400 for
the motivation, and examples/offline_inference/torchrun_example.py
for the usage example.
The key idea: although it is tensor-parallel inference, we only
create one worker per executor, users will launch multiple
engines with torchrun-compatible launchers, and all these engines
work together to process the same prompts. When scheduling is
deterministic, all the engines will generate the same outputs,
and they don't need to synchronize the states with each other.
"""
def _init_executor(self) -> None:
    """Initialize the worker and load the model.

    The torchrun-launched engines rely on deterministic scheduling to stay
    in lockstep (see the class docstring), so the in-process engine is
    required and V1 multiprocessing must be disabled.
    """
    assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, (
        "To get deterministic execution, "
        "please set VLLM_ENABLE_V1_MULTIPROCESSING=0"
    )
    super()._init_executor()
def _distributed_args(self) -> tuple[str, int, int]:
    """Return (distributed_init_method, rank, local_rank).

    Engines are launched by a torchrun-compatible launcher, so the
    standard ``env://`` rendezvous method is used. Required env vars:
    RANK, LOCAL_RANK, MASTER_ADDR, MASTER_PORT.
    """
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    return "env://", rank, local_rank
def determine_available_memory(self) -> list[int]:  # in bytes
    """Return available memory in bytes, reduced to the MIN across all ranks.

    Each engine measures its own availability via the superclass; taking
    the global minimum ensures every engine agrees on the same value,
    which the class requires for deterministic, lockstep execution.
    """
    # we need to get the min across all ranks.
    memory = super().determine_available_memory()
    from vllm.distributed.parallel_state import get_world_group

    cpu_group = get_world_group().cpu_group
    # NOTE(review): wrapping `memory` (a list) in another list and calling
    # .item() below assumes the superclass returns a single-element list —
    # confirm against the base implementation.
    memory_tensor = torch.tensor([memory], device="cpu", dtype=torch.int64)
    dist.all_reduce(memory_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
    return [memory_tensor.item()]