Upgrade to vllm 0.17.0 corex v4.1 overlay
This commit is contained in:
@@ -104,11 +104,23 @@ try:
|
||||
scheduler_output, intermediate_tensors
|
||||
)
|
||||
if self._is_intermediate_tensors(output):
|
||||
if (
|
||||
self.worker.model_runner.supports_mm_inputs
|
||||
and get_pp_group().is_first_rank
|
||||
):
|
||||
# Strip mm_features before Ray forwards it to the next PP Stage.
|
||||
# PP Stage>0 only needs the intermediate tensors,
|
||||
# not preprocessed multimodal data.
|
||||
|
||||
# scheduled_new_reqs is a required field of SchedulerOutput,
|
||||
# so accessing it directly will raise AttributeError if missing.
|
||||
for req in scheduler_output.scheduled_new_reqs:
|
||||
req.mm_features = []
|
||||
return scheduler_output, grammar_output, output
|
||||
|
||||
if isinstance(output, AsyncModelRunnerOutput):
|
||||
output = output.get_output()
|
||||
if not get_pp_group().is_last_rank:
|
||||
if not self._is_last_rank():
|
||||
# Case where there are no scheduled requests
|
||||
# but may still be finished requests.
|
||||
assert not output or not output.req_ids
|
||||
@@ -128,6 +140,9 @@ try:
|
||||
def _is_intermediate_tensors(self, output) -> bool:
    """Return True when *output* is an ``IntermediateTensors`` payload.

    Non-final pipeline-parallel stages hand back intermediate tensors
    rather than finished model-runner output; callers use this check to
    decide whether the result must be forwarded to the next PP stage.
    """
    is_intermediate = isinstance(output, IntermediateTensors)
    return is_intermediate
|
||||
|
||||
def _is_last_rank(self) -> bool:
    """Return True when this worker runs on the last pipeline-parallel rank."""
    pp_group = get_pp_group()
    return pp_group.is_last_rank
|
||||
|
||||
ray_import_err = None
|
||||
|
||||
except ImportError as e:
|
||||
@@ -362,7 +377,40 @@ def initialize_ray_cluster(
|
||||
runtime_env=parallel_config.ray_runtime_env,
|
||||
)
|
||||
else:
|
||||
ray.init(address=ray_address, runtime_env=parallel_config.ray_runtime_env)
|
||||
import os
|
||||
import torch
|
||||
import vllm.envs as envs
|
||||
runtime_env = {}
|
||||
device_count = torch.cuda.device_count()
|
||||
nccl_if_name = os.environ.get("NCCL_SOCKET_IFNAME",None)
|
||||
vllm_nccl_comm = os.environ.get("VLLM_FORCE_NCCL_COMM",None)
|
||||
if nccl_if_name is not None and vllm_nccl_comm is not None:
|
||||
runtime_env = {"env_vars":{
|
||||
"NCCL_SOCKET_IFNAME":nccl_if_name,
|
||||
"VLLM_FORCE_NCCL_COMM":vllm_nccl_comm}}
|
||||
elif nccl_if_name is not None:
|
||||
runtime_env = {"env_vars":{
|
||||
"NCCL_SOCKET_IFNAME":nccl_if_name}}
|
||||
elif vllm_nccl_comm is not None:
|
||||
runtime_env = {"env_vars":{
|
||||
"VLLM_FORCE_NCCL_COMM":vllm_nccl_comm}}
|
||||
if "env_vars" not in runtime_env:
|
||||
runtime_env = {
|
||||
"env_vars":{"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES":"1"}
|
||||
}
|
||||
else:
|
||||
runtime_env["env_vars"].update({"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES":"1"})
|
||||
all_envs = dict(os.environ)
|
||||
all_vllm_envs = {k: v for k,v in all_envs.items() if "VLLM" in k}
|
||||
runtime_env["env_vars"].update(all_vllm_envs)
|
||||
# ray.init(address=ray_address, ignore_reinit_error=True, runtime_env=runtime_env)
|
||||
if device_count >= parallel_config.world_size:
|
||||
ray.init(address=ray_address,
|
||||
ignore_reinit_error=True,
|
||||
num_gpus=parallel_config.world_size,
|
||||
runtime_env=runtime_env)
|
||||
else:
|
||||
ray.init(address=ray_address, ignore_reinit_error=True, runtime_env=runtime_env)
|
||||
|
||||
device_str = current_platform.ray_device_key
|
||||
if not device_str:
|
||||
|
||||
Reference in New Issue
Block a user