Upgrade to vllm 0.17.0 corex v4.1 overlay
This commit is contained in:
@@ -104,11 +104,23 @@ try:
|
||||
scheduler_output, intermediate_tensors
|
||||
)
|
||||
if self._is_intermediate_tensors(output):
|
||||
if (
|
||||
self.worker.model_runner.supports_mm_inputs
|
||||
and get_pp_group().is_first_rank
|
||||
):
|
||||
# Strip mm_features before Ray forwards it to the next PP Stage.
|
||||
# PP Stage>0 only needs the intermediate tensors,
|
||||
# not preprocessed multimodal data.
|
||||
|
||||
# scheduled_new_reqs is a required field of SchedulerOutput,
|
||||
# so accessing it directly will raise AttributeError if missing.
|
||||
for req in scheduler_output.scheduled_new_reqs:
|
||||
req.mm_features = []
|
||||
return scheduler_output, grammar_output, output
|
||||
|
||||
if isinstance(output, AsyncModelRunnerOutput):
|
||||
output = output.get_output()
|
||||
if not get_pp_group().is_last_rank:
|
||||
if not self._is_last_rank():
|
||||
# Case where there are no scheduled requests
|
||||
# but may still be finished requests.
|
||||
assert not output or not output.req_ids
|
||||
@@ -128,6 +140,9 @@ try:
|
||||
def _is_intermediate_tensors(self, output) -> bool:
    """Return True when *output* is an ``IntermediateTensors`` payload.

    Non-final pipeline-parallel stages hand back intermediate tensors
    rather than finished model-runner output; callers use this check to
    decide whether the result must be forwarded to the next PP stage.
    """
    is_intermediate = isinstance(output, IntermediateTensors)
    return is_intermediate
|
||||
|
||||
def _is_last_rank(self) -> bool:
    """Return True when this worker runs on the last pipeline-parallel rank."""
    pp_group = get_pp_group()
    return pp_group.is_last_rank
|
||||
|
||||
ray_import_err = None
|
||||
|
||||
except ImportError as e:
|
||||
@@ -362,7 +377,40 @@ def initialize_ray_cluster(
|
||||
runtime_env=parallel_config.ray_runtime_env,
|
||||
)
|
||||
else:
|
||||
ray.init(address=ray_address, runtime_env=parallel_config.ray_runtime_env)
|
||||
import os
|
||||
import torch
|
||||
import vllm.envs as envs
|
||||
runtime_env = {}
|
||||
device_count = torch.cuda.device_count()
|
||||
nccl_if_name = os.environ.get("NCCL_SOCKET_IFNAME",None)
|
||||
vllm_nccl_comm = os.environ.get("VLLM_FORCE_NCCL_COMM",None)
|
||||
if nccl_if_name is not None and vllm_nccl_comm is not None:
|
||||
runtime_env = {"env_vars":{
|
||||
"NCCL_SOCKET_IFNAME":nccl_if_name,
|
||||
"VLLM_FORCE_NCCL_COMM":vllm_nccl_comm}}
|
||||
elif nccl_if_name is not None:
|
||||
runtime_env = {"env_vars":{
|
||||
"NCCL_SOCKET_IFNAME":nccl_if_name}}
|
||||
elif vllm_nccl_comm is not None:
|
||||
runtime_env = {"env_vars":{
|
||||
"VLLM_FORCE_NCCL_COMM":vllm_nccl_comm}}
|
||||
if "env_vars" not in runtime_env:
|
||||
runtime_env = {
|
||||
"env_vars":{"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES":"1"}
|
||||
}
|
||||
else:
|
||||
runtime_env["env_vars"].update({"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES":"1"})
|
||||
all_envs = dict(os.environ)
|
||||
all_vllm_envs = {k: v for k,v in all_envs.items() if "VLLM" in k}
|
||||
runtime_env["env_vars"].update(all_vllm_envs)
|
||||
# ray.init(address=ray_address, ignore_reinit_error=True, runtime_env=runtime_env)
|
||||
if device_count >= parallel_config.world_size:
|
||||
ray.init(address=ray_address,
|
||||
ignore_reinit_error=True,
|
||||
num_gpus=parallel_config.world_size,
|
||||
runtime_env=runtime_env)
|
||||
else:
|
||||
ray.init(address=ray_address, ignore_reinit_error=True, runtime_env=runtime_env)
|
||||
|
||||
device_str = current_platform.ray_device_key
|
||||
if not device_str:
|
||||
|
||||
Reference in New Issue
Block a user