Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -115,7 +115,15 @@ class Executor(ABC):
        underlying workers.
        """
        self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
-        self.collective_rpc("compile_or_warm_up_model")
+        compilation_times: list[float] = self.collective_rpc("compile_or_warm_up_model")
+        # Propagate compilation time from workers back to the main process.
+        # With TP>1, compilation happens in worker processes, so the main
+        # process config is never updated. Use max across workers since they
+        # compile in parallel.
+        if compilation_times:
+            self.vllm_config.compilation_config.compilation_time = max(
+                compilation_times
+            )

    def register_failure_callback(self, callback: FailureCallback):  # noqa: B027
        """
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -38,12 +38,14 @@ from vllm.distributed.parallel_state import (
    get_pcp_group,
    get_pp_group,
    get_tp_group,
+    model_parallel_is_initialized,
 )
 from vllm.envs import enable_envs_cache
 from vllm.logger import init_logger
 from vllm.tracing import instrument, maybe_init_worker_tracer
 from vllm.utils.network_utils import (
    get_distributed_init_method,
+    get_ip,
    get_loopback_ip,
    get_open_port,
 )
@@ -128,11 +130,27 @@ class MultiprocExecutor(Executor):
            # For leader node within each dp rank,
            # each dp will have its own leader multiproc executor.
            max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
+            mq_connect_ip = get_ip()
+            logger.info(
+                "DP group leader: node_rank=%d, node_rank_within_dp=%d, "
+                "master_addr=%s, mq_connect_ip=%s (local), "
+                "world_size=%d, local_world_size=%d",
+                self.parallel_config.node_rank,
+                self.parallel_config.node_rank_within_dp,
+                self.parallel_config.master_addr,
+                mq_connect_ip,
+                self.world_size,
+                self.local_world_size,
+            )
+            mq_kwargs: dict[str, Any] = {}
+            if envs.VLLM_ENABLE_PP_ILU_OPT:
+                mq_kwargs["max_chunks"] = 32
            self.rpc_broadcast_mq = MessageQueue(
                self.world_size,
                self.local_world_size,
                max_chunk_bytes=max_chunk_bytes,
-                connect_ip=self.parallel_config.master_addr,
+                connect_ip=mq_connect_ip,
+                **mq_kwargs,
            )
            scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
        # Create workers
@@ -567,17 +585,22 @@ class WorkerProc:
            )
            self.async_output_copy_thread.start()

-        # Initialize device
-        self.worker.init_device()
-
-        # Set process title and log prefix
        self.setup_proc_title_and_log_prefix(
            enable_ep=vllm_config.parallel_config.enable_expert_parallel
        )

        # Load model
+        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
+        if not is_eep_new_worker:
+            self.worker.init_device()
+            # Update process title now that parallel groups are initialized
+            self.setup_proc_title_and_log_prefix(
+                enable_ep=vllm_config.parallel_config.enable_expert_parallel
+            )
+            self.worker.load_model()
+        # Initialize message queues after init_device() since multi-node setups
+        # (nnodes_within_dp > 1) require distributed groups to be initialized
        self._init_message_queues(input_shm_handle, vllm_config)
-        self.worker.load_model()

        # Enable environment variable cache (e.g. assume no more
        # environment variable overrides after this point)
@@ -872,6 +895,13 @@ class WorkerProc:

    @staticmethod
    def setup_proc_title_and_log_prefix(enable_ep: bool) -> None:
+        # Check if parallel groups are initialized first
+        if not model_parallel_is_initialized():
+            # Parallel groups not yet initialized, use default process name
+            set_process_title(name="Worker")
+            decorate_logs("Worker")
+            return
+
        dp_size = get_dp_group().world_size
        dp_rank = get_dp_group().rank_in_group
        pp_size = get_pp_group().world_size
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -382,8 +382,10 @@ class RayDistributedExecutor(Executor):
            all_kwargs.append(kwargs)
        self.collective_rpc("init_worker", args=(all_kwargs,))

-        self.collective_rpc("init_device")
-        self.collective_rpc("load_model")
+        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
+        if not is_eep_new_worker:
+            self.collective_rpc("init_device")
+            self.collective_rpc("load_model")

        for pp_rank in range(self.parallel_config.pipeline_parallel_size):
            self.pp_tp_workers.append([])
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -104,11 +104,23 @@ try:
                scheduler_output, intermediate_tensors
            )
            if self._is_intermediate_tensors(output):
+                if (
+                    self.worker.model_runner.supports_mm_inputs
+                    and get_pp_group().is_first_rank
+                ):
+                    # Strip mm_features from scheduler_output before Ray forwards
+                    # it to the next PP stage; stages > 0 only need the
+                    # intermediate tensors, not preprocessed multimodal data.
+
+                    # scheduled_new_reqs is a required field of SchedulerOutput,
+                    # so accessing it directly will raise AttributeError if missing.
+                    for req in scheduler_output.scheduled_new_reqs:
+                        req.mm_features = []
                return scheduler_output, grammar_output, output

            if isinstance(output, AsyncModelRunnerOutput):
                output = output.get_output()
-            if not get_pp_group().is_last_rank:
+            if not self._is_last_rank():
                # Case where there are no scheduled requests
                # but may still be finished requests.
                assert not output or not output.req_ids
@@ -128,6 +140,9 @@ try:
        def _is_intermediate_tensors(self, output) -> bool:
            return isinstance(output, IntermediateTensors)

+        def _is_last_rank(self) -> bool:
+            return get_pp_group().is_last_rank
+
    ray_import_err = None

 except ImportError as e:
@@ -362,7 +377,40 @@ def initialize_ray_cluster(
                runtime_env=parallel_config.ray_runtime_env,
            )
    else:
-        ray.init(address=ray_address, runtime_env=parallel_config.ray_runtime_env)
+        import os
+        import torch
+        import vllm.envs as envs
+        runtime_env = {}
+        device_count = torch.cuda.device_count()
+        nccl_if_name = os.environ.get("NCCL_SOCKET_IFNAME",None)
+        vllm_nccl_comm = os.environ.get("VLLM_FORCE_NCCL_COMM",None)
+        if nccl_if_name is not None and vllm_nccl_comm is not None:
+            runtime_env = {"env_vars":{
+                            "NCCL_SOCKET_IFNAME":nccl_if_name,
+                            "VLLM_FORCE_NCCL_COMM":vllm_nccl_comm}}
+        elif nccl_if_name is not None:
+            runtime_env = {"env_vars":{
+                            "NCCL_SOCKET_IFNAME":nccl_if_name}}
+        elif vllm_nccl_comm is not None:
+            runtime_env = {"env_vars":{
+                            "VLLM_FORCE_NCCL_COMM":vllm_nccl_comm}}
+        if "env_vars" not in runtime_env:
+            runtime_env = {
+                "env_vars":{"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES":"1"}
+            }
+        else:
+            runtime_env["env_vars"].update({"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES":"1"})
+        all_envs = dict(os.environ)
+        all_vllm_envs = {k: v for k,v in all_envs.items() if "VLLM" in k}
+        runtime_env["env_vars"].update(all_vllm_envs)
+        # Pass num_gpus=world_size only when torch sees at least that many devices.
+        if device_count >= parallel_config.world_size:
+            ray.init(address=ray_address,
+                     ignore_reinit_error=True,
+                     num_gpus=parallel_config.world_size,
+                     runtime_env=runtime_env)
+        else:
+            ray.init(address=ray_address, ignore_reinit_error=True, runtime_env=runtime_env)

    device_str = current_platform.ray_device_key
    if not device_str:
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -14,7 +14,6 @@ import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
-from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
 from vllm.v1.serial_utils import run_method
@@ -43,9 +42,11 @@ class UniProcExecutor(Executor):
                max_workers=1, thread_name_prefix="WorkerAsyncOutput"
            )

+        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
        self.driver_worker.init_worker(all_kwargs=[kwargs])
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
+        if not is_eep_new_worker:
+            self.driver_worker.init_device()
+            self.driver_worker.load_model()

    def _distributed_args(self) -> tuple[str, int, int]:
        """Return (distributed_init_method, rank, local_rank)."""
@@ -122,16 +123,6 @@ class UniProcExecutor(Executor):
        # it's running.
        return

-    def reinitialize_distributed(
-        self, reconfig_request: ReconfigureDistributedRequest
-    ) -> None:
-        self.driver_worker.reinitialize_distributed(reconfig_request)
-        if (
-            reconfig_request.new_data_parallel_rank
-            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
-        ):
-            self.shutdown()
-
    def shutdown(self) -> None:
        if worker := self.driver_worker:
            worker.shutdown()