Upgrade to vllm 0.17.0 corex v4.1 overlay
This commit is contained in:
@@ -115,7 +115,15 @@ class Executor(ABC):
|
||||
underlying workers.
|
||||
"""
|
||||
self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
|
||||
self.collective_rpc("compile_or_warm_up_model")
|
||||
compilation_times: list[float] = self.collective_rpc("compile_or_warm_up_model")
|
||||
# Propagate compilation time from workers back to the main process.
|
||||
# With TP>1, compilation happens in worker processes, so the main
|
||||
# process config is never updated. Use max across workers since they
|
||||
# compile in parallel.
|
||||
if compilation_times:
|
||||
self.vllm_config.compilation_config.compilation_time = max(
|
||||
compilation_times
|
||||
)
|
||||
|
||||
def register_failure_callback(self, callback: FailureCallback): # noqa: B027
|
||||
"""
|
||||
|
||||
@@ -38,12 +38,14 @@ from vllm.distributed.parallel_state import (
|
||||
get_pcp_group,
|
||||
get_pp_group,
|
||||
get_tp_group,
|
||||
model_parallel_is_initialized,
|
||||
)
|
||||
from vllm.envs import enable_envs_cache
|
||||
from vllm.logger import init_logger
|
||||
from vllm.tracing import instrument, maybe_init_worker_tracer
|
||||
from vllm.utils.network_utils import (
|
||||
get_distributed_init_method,
|
||||
get_ip,
|
||||
get_loopback_ip,
|
||||
get_open_port,
|
||||
)
|
||||
@@ -128,11 +130,27 @@ class MultiprocExecutor(Executor):
|
||||
# For leader node within each dp rank,
|
||||
# each dp will have its own leader multiproc executor.
|
||||
max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
|
||||
mq_connect_ip = get_ip()
|
||||
logger.info(
|
||||
"DP group leader: node_rank=%d, node_rank_within_dp=%d, "
|
||||
"master_addr=%s, mq_connect_ip=%s (local), "
|
||||
"world_size=%d, local_world_size=%d",
|
||||
self.parallel_config.node_rank,
|
||||
self.parallel_config.node_rank_within_dp,
|
||||
self.parallel_config.master_addr,
|
||||
mq_connect_ip,
|
||||
self.world_size,
|
||||
self.local_world_size,
|
||||
)
|
||||
mq_kwargs: dict[str, Any] = {}
|
||||
if envs.VLLM_ENABLE_PP_ILU_OPT:
|
||||
mq_kwargs["max_chunks"] = 32
|
||||
self.rpc_broadcast_mq = MessageQueue(
|
||||
self.world_size,
|
||||
self.local_world_size,
|
||||
max_chunk_bytes=max_chunk_bytes,
|
||||
connect_ip=self.parallel_config.master_addr,
|
||||
connect_ip=mq_connect_ip,
|
||||
**mq_kwargs,
|
||||
)
|
||||
scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
|
||||
# Create workers
|
||||
@@ -567,17 +585,22 @@ class WorkerProc:
|
||||
)
|
||||
self.async_output_copy_thread.start()
|
||||
|
||||
# Initialize device
|
||||
self.worker.init_device()
|
||||
|
||||
# Set process title and log prefix
|
||||
self.setup_proc_title_and_log_prefix(
|
||||
enable_ep=vllm_config.parallel_config.enable_expert_parallel
|
||||
)
|
||||
|
||||
# Load model
|
||||
is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
|
||||
if not is_eep_new_worker:
|
||||
self.worker.init_device()
|
||||
# Update process title now that parallel groups are initialized
|
||||
self.setup_proc_title_and_log_prefix(
|
||||
enable_ep=vllm_config.parallel_config.enable_expert_parallel
|
||||
)
|
||||
self.worker.load_model()
|
||||
# Initialize message queues after init_device() since multi-node setups
|
||||
# (nnodes_within_dp > 1) require distributed groups to be initialized
|
||||
self._init_message_queues(input_shm_handle, vllm_config)
|
||||
self.worker.load_model()
|
||||
|
||||
# Enable environment variable cache (e.g. assume no more
|
||||
# environment variable overrides after this point)
|
||||
@@ -872,6 +895,13 @@ class WorkerProc:
|
||||
|
||||
@staticmethod
|
||||
def setup_proc_title_and_log_prefix(enable_ep: bool) -> None:
|
||||
# Check if parallel groups are initialized first
|
||||
if not model_parallel_is_initialized():
|
||||
# Parallel groups not yet initialized, use default process name
|
||||
set_process_title(name="Worker")
|
||||
decorate_logs("Worker")
|
||||
return
|
||||
|
||||
dp_size = get_dp_group().world_size
|
||||
dp_rank = get_dp_group().rank_in_group
|
||||
pp_size = get_pp_group().world_size
|
||||
|
||||
@@ -382,8 +382,10 @@ class RayDistributedExecutor(Executor):
|
||||
all_kwargs.append(kwargs)
|
||||
self.collective_rpc("init_worker", args=(all_kwargs,))
|
||||
|
||||
self.collective_rpc("init_device")
|
||||
self.collective_rpc("load_model")
|
||||
is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
|
||||
if not is_eep_new_worker:
|
||||
self.collective_rpc("init_device")
|
||||
self.collective_rpc("load_model")
|
||||
|
||||
for pp_rank in range(self.parallel_config.pipeline_parallel_size):
|
||||
self.pp_tp_workers.append([])
|
||||
|
||||
@@ -104,11 +104,23 @@ try:
|
||||
scheduler_output, intermediate_tensors
|
||||
)
|
||||
if self._is_intermediate_tensors(output):
|
||||
if (
|
||||
self.worker.model_runner.supports_mm_inputs
|
||||
and get_pp_group().is_first_rank
|
||||
):
|
||||
# Strip mm_features before Ray forwards it to the next PP Stage.
|
||||
# PP Stage>0 only needs the intermediate tensors,
|
||||
# not preprocessed multimodal data.
|
||||
|
||||
# scheduled_new_reqs is a required field of SchedulerOutput,
|
||||
# so accessing it directly will raise AttributeError if missing.
|
||||
for req in scheduler_output.scheduled_new_reqs:
|
||||
req.mm_features = []
|
||||
return scheduler_output, grammar_output, output
|
||||
|
||||
if isinstance(output, AsyncModelRunnerOutput):
|
||||
output = output.get_output()
|
||||
if not get_pp_group().is_last_rank:
|
||||
if not self._is_last_rank():
|
||||
# Case where there are no scheduled requests
|
||||
# but may still be finished requests.
|
||||
assert not output or not output.req_ids
|
||||
@@ -128,6 +140,9 @@ try:
|
||||
def _is_intermediate_tensors(self, output) -> bool:
|
||||
return isinstance(output, IntermediateTensors)
|
||||
|
||||
def _is_last_rank(self) -> bool:
|
||||
return get_pp_group().is_last_rank
|
||||
|
||||
ray_import_err = None
|
||||
|
||||
except ImportError as e:
|
||||
@@ -362,7 +377,40 @@ def initialize_ray_cluster(
|
||||
runtime_env=parallel_config.ray_runtime_env,
|
||||
)
|
||||
else:
|
||||
ray.init(address=ray_address, runtime_env=parallel_config.ray_runtime_env)
|
||||
import os
|
||||
import torch
|
||||
import vllm.envs as envs
|
||||
runtime_env = {}
|
||||
device_count = torch.cuda.device_count()
|
||||
nccl_if_name = os.environ.get("NCCL_SOCKET_IFNAME",None)
|
||||
vllm_nccl_comm = os.environ.get("VLLM_FORCE_NCCL_COMM",None)
|
||||
if nccl_if_name is not None and vllm_nccl_comm is not None:
|
||||
runtime_env = {"env_vars":{
|
||||
"NCCL_SOCKET_IFNAME":nccl_if_name,
|
||||
"VLLM_FORCE_NCCL_COMM":vllm_nccl_comm}}
|
||||
elif nccl_if_name is not None:
|
||||
runtime_env = {"env_vars":{
|
||||
"NCCL_SOCKET_IFNAME":nccl_if_name}}
|
||||
elif vllm_nccl_comm is not None:
|
||||
runtime_env = {"env_vars":{
|
||||
"VLLM_FORCE_NCCL_COMM":vllm_nccl_comm}}
|
||||
if "env_vars" not in runtime_env:
|
||||
runtime_env = {
|
||||
"env_vars":{"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES":"1"}
|
||||
}
|
||||
else:
|
||||
runtime_env["env_vars"].update({"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES":"1"})
|
||||
all_envs = dict(os.environ)
|
||||
all_vllm_envs = {k: v for k,v in all_envs.items() if "VLLM" in k}
|
||||
runtime_env["env_vars"].update(all_vllm_envs)
|
||||
# ray.init(address=ray_address, ignore_reinit_error=True, runtime_env=runtime_env)
|
||||
if device_count >= parallel_config.world_size:
|
||||
ray.init(address=ray_address,
|
||||
ignore_reinit_error=True,
|
||||
num_gpus=parallel_config.world_size,
|
||||
runtime_env=runtime_env)
|
||||
else:
|
||||
ray.init(address=ray_address, ignore_reinit_error=True, runtime_env=runtime_env)
|
||||
|
||||
device_str = current_platform.ray_device_key
|
||||
if not device_str:
|
||||
|
||||
@@ -14,7 +14,6 @@ import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
|
||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
|
||||
from vllm.v1.serial_utils import run_method
|
||||
@@ -43,9 +42,11 @@ class UniProcExecutor(Executor):
|
||||
max_workers=1, thread_name_prefix="WorkerAsyncOutput"
|
||||
)
|
||||
|
||||
is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
|
||||
self.driver_worker.init_worker(all_kwargs=[kwargs])
|
||||
self.driver_worker.init_device()
|
||||
self.driver_worker.load_model()
|
||||
if not is_eep_new_worker:
|
||||
self.driver_worker.init_device()
|
||||
self.driver_worker.load_model()
|
||||
|
||||
def _distributed_args(self) -> tuple[str, int, int]:
|
||||
"""Return (distributed_init_method, rank, local_rank)."""
|
||||
@@ -122,16 +123,6 @@ class UniProcExecutor(Executor):
|
||||
# it's running.
|
||||
return
|
||||
|
||||
def reinitialize_distributed(
|
||||
self, reconfig_request: ReconfigureDistributedRequest
|
||||
) -> None:
|
||||
self.driver_worker.reinitialize_distributed(reconfig_request)
|
||||
if (
|
||||
reconfig_request.new_data_parallel_rank
|
||||
== ReconfigureRankType.SHUTDOWN_CURRENT_RANK
|
||||
):
|
||||
self.shutdown()
|
||||
|
||||
def shutdown(self) -> None:
|
||||
if worker := self.driver_worker:
|
||||
worker.shutdown()
|
||||
|
||||
Reference in New Issue
Block a user