Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -17,6 +17,7 @@ from typing import Any, TypeVar, cast
import msgspec
import zmq

import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig
from vllm.distributed import stateless_destroy_torch_distributed_process_group
from vllm.envs import enable_envs_cache
@@ -44,6 +45,8 @@ from vllm.v1.core.kv_cache_utils import (
from vllm.v1.core.sched.interface import PauseState, SchedulerInterface
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.engine import (
    EEP_NOTIFICATION_CALL_ID,
    EEPNotificationType,
    EngineCoreOutput,
    EngineCoreOutputs,
    EngineCoreRequest,
@@ -72,7 +75,6 @@ from vllm.version import __version__ as VLLM_VERSION

logger = init_logger(__name__)

POLLING_TIMEOUT_S = 2.5
HANDSHAKE_TIMEOUT_MINS = 5

_R = TypeVar("_R")  # Return type for collective_rpc
@@ -111,6 +113,9 @@ class EngineCore:

        self.available_gpu_memory_for_kv_cache = -1

        if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
            self._eep_scale_up_before_kv_init()

        # Setup KV Caches and update CacheConfig after profiling.
        num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
            vllm_config
@@ -180,13 +185,55 @@ class EngineCore:
        # Batch queue for scheduled batches. This enables us to asynchronously
        # schedule and execute batches, and is required by pipeline parallelism
        # to eliminate pipeline bubbles.
        self.batch_queue_size = self.model_executor.max_concurrent_batches
        base_batch_queue_size = self.model_executor.max_concurrent_batches
        if envs.VLLM_ENABLE_PP_ILU_OPT:
            self.batch_queue_size = envs.VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE
            if self.batch_queue_size <= 0:
                self.batch_queue_size = base_batch_queue_size * 2
            self._use_batch_queue_ilu_opt = True
            logger.info(
                "PP ILU opt is enabled: batch_queue_size=%d (base=%d)",
                self.batch_queue_size,
                base_batch_queue_size,
            )
        else:
            self.batch_queue_size = base_batch_queue_size
            self._use_batch_queue_ilu_opt = False
        self.batch_queue: (
            deque[tuple[Future[ModelRunnerOutput], SchedulerOutput, Future[Any]]] | None
        ) = None
        if self.batch_queue_size > 1:
            logger.debug("Batch queue is enabled with size %d", self.batch_queue_size)
            logger.info(
                "Batch queue is enabled with size %d (ilu_opt=%s)",
                self.batch_queue_size,
                self._use_batch_queue_ilu_opt,
            )
            self.batch_queue = deque(maxlen=self.batch_queue_size)
            if self._use_batch_queue_ilu_opt:
                self.engine_core_input_queue: queue.Queue[
                    tuple[Future[ModelRunnerOutput], SchedulerOutput]
                ] = queue.Queue(maxsize=self.batch_queue_size)
                self.engine_core_output_queue: queue.Queue[
                    tuple[SchedulerOutput, ModelRunnerOutput, bool]
                ] = queue.Queue(maxsize=self.batch_queue_size)
                self._batch_queue_loop_thread = threading.Thread(
                    target=self._process_batch_queue_loop,
                    daemon=True,
                )
                self._batch_queue_loop_thread.start()

        # When PP mix ILU scheduling or PP ILU opt is enabled with a KV
        # connector, only NixlConnector is supported.
        if vllm_config.kv_transfer_config is not None and (
            envs.VLLM_ENABLE_PP_MIX_ILU_SCHEDULING or envs.VLLM_ENABLE_PP_ILU_OPT
        ):
            kv_connector_name = vllm_config.kv_transfer_config.kv_connector
            if kv_connector_name != "NixlConnector":
                raise ValueError(
                    "When VLLM_ENABLE_PP_MIX_ILU_SCHEDULING or VLLM_ENABLE_PP_ILU_OPT "
                    "is enabled with a KV connector, only NixlConnector is supported; "
                    f"current kv_connector is {kv_connector_name!r}."
                )

        self.is_ec_producer = (
            vllm_config.ec_transfer_config is not None
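
The hunk above decouples scheduling from execution: the main thread pushes (future, scheduler_output) pairs into a bounded input queue, and a daemon thread blocks on each future and forwards the result to an output queue. A minimal self-contained sketch of that wiring, with a thread pool standing in for the real model executor (queue names follow the diff; the payloads are illustrative only):

    import queue
    import threading
    from concurrent.futures import Future, ThreadPoolExecutor

    batch_queue_size = 4  # stand-in; the diff doubles the base size when the env override is <= 0
    input_q: queue.Queue[tuple[Future, str]] = queue.Queue(maxsize=batch_queue_size)
    output_q: queue.Queue[tuple[str, int]] = queue.Queue(maxsize=batch_queue_size)

    def batch_queue_loop() -> None:
        # Daemon thread: block on in-flight futures so the main thread never does.
        while True:
            future, sched_out = input_q.get()
            output_q.put((sched_out, future.result()))

    threading.Thread(target=batch_queue_loop, daemon=True).start()

    pool = ThreadPoolExecutor(max_workers=1)
    input_q.put((pool.submit(lambda: 42), "batch-0"))  # stand-in for execute_model(..., non_block=True)
    print(output_q.get())  # ('batch-0', 42)

Because both queues are bounded by batch_queue_size, a full input queue naturally backpressures the scheduler, which step_with_batch_queue_ilu_opt later relies on.
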
@@ -209,6 +256,10 @@ class EngineCore:
            self.step if self.batch_queue is None else self.step_with_batch_queue
        )
        self.async_scheduling = vllm_config.scheduler_config.async_scheduling

        self.draft_in_model_output = (
            self.batch_queue is not None and self.use_spec_decode
        )

        self.aborts_queue = queue.Queue[list[str]]()
@@ -234,12 +285,10 @@ class EngineCore:

        has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
        if has_kv_cache:
            if os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1":
                dp_group = getattr(self, "dp_group", None)
                assert dp_group is not None
                self.available_gpu_memory_for_kv_cache = (
                    ParallelConfig.sync_kv_cache_memory_size(dp_group, -1)
                )
            if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
                # NOTE(yongji): should already be set
                # during _eep_scale_up_before_kv_init
                assert self.available_gpu_memory_for_kv_cache > 0
            available_gpu_memory = [self.available_gpu_memory_for_kv_cache] * len(
                kv_cache_specs
            )
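
The hunk above drops the in-place sync and assumes available_gpu_memory_for_kv_cache was already populated during _eep_scale_up_before_kv_init, where the new engine passes -1 to ParallelConfig.sync_kv_cache_memory_size to receive the group's value. A hedged, pure-Python sketch of the semantics implied by that -1 convention (not vLLM's actual implementation, which reduces over the DP process group):

    def sync_kv_cache_memory_size_sketch(group_values: list[int]) -> int:
        # Ranks passing -1 only receive; contributing ranks agree on the minimum.
        INF = float("inf")
        return int(min(v if v != -1 else INF for v in group_values))

    # A new engine (-1) adopts the smallest budget measured by existing engines.
    assert sync_kv_cache_memory_size_sketch([-1, 8 << 30, 9 << 30]) == 8 << 30
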
@@ -408,12 +457,52 @@ class EngineCore:
        # When using async scheduling we can't get draft token ids in advance,
        # so we update draft token ids in the worker process and don't
        # need to update draft token ids here.
        if self.draft_in_model_output:
            return
        if not self.async_scheduling and self.use_spec_decode and model_executed:
            # Take the draft token ids.
            draft_token_ids = self.model_executor.take_draft_token_ids()
            if draft_token_ids is not None:
                self.scheduler.update_draft_token_ids(draft_token_ids)

    def _has_kv_connector_work(self, meta: Any) -> bool:
        """Return True if kv_connector_metadata has any recv/save/send work."""
        if meta is None:
            return False
        for attr in ("reqs_to_recv", "reqs_to_save", "reqs_to_send"):
            val = getattr(meta, attr, None)
            if val is not None and len(val) > 0:
                return True
        return False

    def _has_meaningful_scheduler_output(
        self, scheduler_output: SchedulerOutput
    ) -> bool:
        """Return False if scheduler_output is effectively empty."""
        return not (
            len(scheduler_output.scheduled_new_reqs) == 0
            and len(scheduler_output.scheduled_cached_reqs.req_ids) == 0
            and len(scheduler_output.num_scheduled_tokens) == 0
            and scheduler_output.total_num_scheduled_tokens == 0
            and len(scheduler_output.scheduled_spec_decode_tokens) == 0
            and len(scheduler_output.scheduled_encoder_inputs) == 0
            and len(scheduler_output.finished_req_ids) == 0
            and (scheduler_output.scheduled_resumed_reqs is None
                 or len(scheduler_output.scheduled_resumed_reqs) == 0)
            and not self._has_kv_connector_work(
                scheduler_output.kv_connector_metadata
            )
        )
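
_has_kv_connector_work deliberately probes the metadata with getattr defaults, so it tolerates connector implementations that lack some of the three request maps. A toy check of that duck-typed probing (the metadata class here is hypothetical; only the attribute names come from the diff):

    from dataclasses import dataclass, field

    @dataclass
    class FakeConnectorMeta:
        reqs_to_recv: dict = field(default_factory=dict)
        reqs_to_send: dict = field(default_factory=dict)
        # no reqs_to_save attribute at all; getattr(..., None) covers that case

    def has_work(meta) -> bool:
        if meta is None:
            return False
        return any(
            (val := getattr(meta, attr, None)) is not None and len(val) > 0
            for attr in ("reqs_to_recv", "reqs_to_save", "reqs_to_send")
        )

    assert not has_work(None)
    assert not has_work(FakeConnectorMeta())
    assert has_work(FakeConnectorMeta(reqs_to_send={"req-1": 0}))
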
    def _process_batch_queue_loop(self) -> None:
        while True:
            future, scheduler_output = self.engine_core_input_queue.get()
            with self.log_error_detail(scheduler_output):
                model_output = future.result()
            self.engine_core_output_queue.put(
                (scheduler_output, model_output, False)
            )
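
_process_batch_queue_loop resolves future.result() under self.log_error_detail(scheduler_output), so failures surface together with the batch that caused them. The real helper is defined elsewhere in EngineCore; a sketch of the pattern it presumably follows:

    import logging
    from contextlib import contextmanager

    logger = logging.getLogger(__name__)

    @contextmanager
    def log_error_detail_sketch(scheduler_output):
        try:
            yield
        except Exception:
            # Annotate the failure with the batch being processed, then re-raise.
            logger.exception("Error processing scheduler output: %r", scheduler_output)
            raise
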
    def step_with_batch_queue(
        self,
    ) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
@@ -434,6 +523,9 @@ class EngineCore:
        batch_queue = self.batch_queue
        assert batch_queue is not None

        if self._use_batch_queue_ilu_opt:
            return self.step_with_batch_queue_ilu_opt()

        # Try to schedule a new batch if the batch queue is not full, but
        # the scheduler may return an empty batch if all requests are scheduled.
        # Note that this is not blocking.
@@ -531,6 +623,96 @@ class EngineCore:

        return engine_core_outputs, model_executed

    def step_with_batch_queue_ilu_opt(
        self,
    ) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
        """Async batch queue variant using background thread for PP ILU opt.

        Uses engine_core_input_queue / engine_core_output_queue with a
        background thread (_process_batch_queue_loop) that blocks on
        future.result(), so the main thread never blocks on GPU compute.
        """
        assert not self.is_ec_producer, (
            "ec_producer is not supported in step_with_batch_queue_ilu_opt"
        )
        assert not self.is_pooling_model, (
            "is_pooling_model is not supported in step_with_batch_queue_ilu_opt"
        )
        assert not self.async_scheduling, (
            "async_scheduling is not supported in step_with_batch_queue_ilu_opt"
        )

        model_executed = False

        if self.scheduler.has_requests():
            scheduler_output = self.scheduler.schedule()
            has_meaningful_schedule = self._has_meaningful_scheduler_output(
                scheduler_output
            )
            if (
                self.engine_core_input_queue.qsize() <= 1
                and not has_meaningful_schedule
            ):
                has_meaningful_schedule = True
            if has_meaningful_schedule:
                logger.debug(
                    "[step_with_batch_queue_ilu_opt] scheduler_output: "
                    "total_num_scheduled_tokens=%s num_scheduled_tokens=%s "
                    "scheduled_new_reqs=%s scheduled_cached_reqs.req_ids=%s "
                    "resumed_req_ids=%s finished_req_ids=%s "
                    "has_meaningful_schedule=%s",
                    scheduler_output.total_num_scheduled_tokens,
                    scheduler_output.num_scheduled_tokens,
                    [r.req_id for r in scheduler_output.scheduled_new_reqs],
                    scheduler_output.scheduled_cached_reqs.req_ids,
                    scheduler_output.scheduled_cached_reqs.resumed_req_ids,
                    scheduler_output.finished_req_ids,
                    has_meaningful_schedule,
                )

            if has_meaningful_schedule:
                exec_future = self.model_executor.execute_model(
                    scheduler_output, non_block=True
                )
                model_executed = (
                    scheduler_output.total_num_scheduled_tokens > 0
                )

                if not model_executed:
                    future = cast(Future[ModelRunnerOutput], exec_future)
                else:
                    grammar_output = self.scheduler.get_grammar_bitmask(
                        scheduler_output
                    )
                    future = self.model_executor.sample_tokens(
                        grammar_output, non_block=True
                    )

                if self.engine_core_input_queue.full():
                    scheduler_output_out, model_output_out, model_executed_out = (
                        self.engine_core_output_queue.get()
                    )
                    engine_core_outputs = self.scheduler.update_from_output(
                        scheduler_output_out, model_output_out
                    )
                    self.engine_core_input_queue.put(
                        (future, scheduler_output)
                    )
                    return engine_core_outputs, model_executed_out

                self.engine_core_input_queue.put((future, scheduler_output))

        try:
            scheduler_output, model_output, model_executed = (
                self.engine_core_output_queue.get_nowait()
            )
            engine_core_outputs = self.scheduler.update_from_output(
                scheduler_output, model_output
            )
            return engine_core_outputs, model_executed
        except queue.Empty:
            return None, False

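Condensed, the control flow above is: schedule at most one batch per call, launch it without blocking, block on the oldest in-flight batch only when the pipeline is full, and otherwise drain one finished batch opportunistically. A stand-in sketch with the scheduler and executor abstracted away (names are shortened; only the queue discipline matches the diff):

    import queue

    def ilu_step_sketch(input_q, output_q, schedule, submit, apply_output):
        if (work := schedule()) is not None:
            fut = submit(work)            # non-blocking launch
            if input_q.full():            # pipeline full: wait on the oldest batch
                done = output_q.get()
                input_q.put((fut, work))
                return apply_output(done), True
            input_q.put((fut, work))
        try:                              # otherwise drain without blocking
            return apply_output(output_q.get_nowait()), True
        except queue.Empty:
            return None, False
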
    def _process_aborts_queue(self):
        if not self.aborts_queue.empty():
            request_ids = []
@@ -753,11 +935,22 @@ class EngineCore:
            self.structured_output_manager.grammar_init(req)
        return req, request.current_wave

    def _eep_scale_up_before_kv_init(self):
        raise NotImplementedError

    def _eep_send_engine_core_notification(
        self,
        notification_type: EEPNotificationType,
        vllm_config: VllmConfig | None = None,
    ):
        raise NotImplementedError


class EngineCoreProc(EngineCore):
    """ZMQ-wrapper for running EngineCore in background process."""

    ENGINE_CORE_DEAD = b"ENGINE_CORE_DEAD"
    addresses: EngineZmqAddresses

    @instrument(span_name="EngineCoreProc init")
    def __init__(
@@ -808,6 +1001,13 @@ class EngineCoreProc(EngineCore):
        # and "hybrid" LB modes.
        self.publish_dp_lb_stats = internal_dp_balancing

        self.addresses = addresses
        self.process_input_queue_block = True
        if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
            self._eep_send_engine_core_notification(
                EEPNotificationType.NEW_CORE_ENGINES_INIT_READY,
                vllm_config=vllm_config,
            )
        self._init_data_parallel(vllm_config)

        super().__init__(
@@ -1120,8 +1320,14 @@ class EngineCoreProc(EngineCore):
            if logger.isEnabledFor(DEBUG):
                logger.debug("EngineCore waiting for work.")
                waited = True
            req = self.input_queue.get()
            self._handle_client_request(*req)
            block = self.process_input_queue_block
            try:
                req = self.input_queue.get(block=block)
                self._handle_client_request(*req)
            except queue.Empty:
                break
            if not block:
                break

        if waited:
            logger.debug("EngineCore loop active.")
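
The change above makes input-queue processing conditionally non-blocking: while an elastic-EP reconfiguration is in flight (process_input_queue_block is False), the loop polls and falls through instead of parking on input_queue.get(), so the busy loop can keep driving the scaling state machine. Note that queue.Queue.get(block=True) without a timeout never raises queue.Empty, so the except arm only fires in the non-blocking mode. A minimal sketch of the drain pattern:

    import queue

    def drain_once(input_queue: queue.Queue, handle, block: bool) -> bool:
        # Returns True if a request was handled.
        try:
            req = input_queue.get(block=block)  # raises queue.Empty only if block=False
        except queue.Empty:
            return False
        handle(req)
        return True
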
@@ -1291,6 +1497,11 @@ class EngineCoreProc(EngineCore):
            for input_socket, _ in poller.poll():
                # (RequestType, RequestData)
                type_frame, *data_frames = input_socket.recv_multipart(copy=False)
                # NOTE(yongji): ignore READY message sent by DP coordinator
                # that is used to notify newly started engines
                if type_frame.buffer == b"READY":
                    assert input_socket == coord_socket
                    continue
                request_type = EngineCoreRequestType(bytes(type_frame.buffer))

                # Deserialize the request data.
@@ -1489,6 +1700,10 @@ class DPEngineCoreProc(EngineCoreProc):
        self.current_wave = 0
        self.last_counts = (0, 0)

        from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState

        self.eep_scaling_state: ElasticEPScalingState | None = None

        # Initialize the engine.
        dp_rank = vllm_config.parallel_config.data_parallel_rank
        super().__init__(
@@ -1512,7 +1727,9 @@ class DPEngineCoreProc(EngineCoreProc):
        assert 0 <= local_dp_rank <= dp_rank < dp_size

        self.dp_rank = dp_rank
        self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
        self.dp_group, self.dp_store = (
            vllm_config.parallel_config.stateless_init_dp_group(return_store=True)
        )

    def shutdown(self):
        super().shutdown()
@@ -1533,7 +1750,11 @@ class DPEngineCoreProc(EngineCoreProc):

    def resume_scheduler(self):
        super().resume_scheduler()
        if not self.engines_running and self.scheduler.has_unfinished_requests():
        if (
            self.has_coordinator
            and not self.engines_running
            and self.scheduler.has_unfinished_requests()
        ):
            # Wake up other DP engines.
            self.output_queue.put_nowait(
                (-1, EngineCoreOutputs(start_wave=self.current_wave))
@@ -1575,7 +1796,12 @@ class DPEngineCoreProc(EngineCoreProc):
            # 1) Poll the input queue until there is work to do.
            self._process_input_queue()

            # 2) Step the engine core.
            if self.eep_scaling_state is not None:
                _ = self.eep_scaling_state.progress()
                if self.eep_scaling_state.is_complete():
                    self.process_input_queue_block = True
                    self.eep_scaling_state = None

            executed = self._process_engine_step()
            self._maybe_publish_request_counts()

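The busy loop now advances the scaling flow cooperatively: one progress() call per engine step, flipping back to blocking input reads once scaling completes. A hedged sketch of the interface this assumes of ElasticEPScalingState (the real class lives in vllm.distributed.elastic_ep.elastic_state; this stand-in only mirrors the two methods the loop uses):

    class ScalingStateSketch:
        def __init__(self, total_steps: int) -> None:
            self._remaining = total_steps

        def progress(self) -> bool:
            # Perform one bounded unit of scaling work per engine-core iteration.
            self._remaining -= 1
            return True

        def is_complete(self) -> bool:
            return self._remaining <= 0
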
@@ -1625,54 +1851,129 @@ class DPEngineCoreProc(EngineCoreProc):
    def reinitialize_distributed(
        self, reconfig_request: ReconfigureDistributedRequest
    ) -> None:
        stateless_destroy_torch_distributed_process_group(self.dp_group)
        self.shutdown()
        from copy import deepcopy

        parallel_config = self.vllm_config.parallel_config
        old_dp_size = parallel_config.data_parallel_size
        parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
        if reconfig_request.new_data_parallel_rank != -1:
            parallel_config.data_parallel_rank = reconfig_request.new_data_parallel_rank
        # local rank specifies device visibility, it should not be changed
        assert (
            reconfig_request.new_data_parallel_rank_local
            == ReconfigureRankType.KEEP_CURRENT_RANK
        )
        parallel_config.data_parallel_master_ip = (
            reconfig_request.new_data_parallel_master_ip
        )
        parallel_config.data_parallel_master_port = (
            reconfig_request.new_data_parallel_master_port
        )
        if reconfig_request.new_data_parallel_rank != -2:
            self.dp_rank = parallel_config.data_parallel_rank
            self.dp_group = parallel_config.stateless_init_dp_group()
            reconfig_request.new_data_parallel_master_port = (
                parallel_config.data_parallel_master_port
            )
        from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState

        self.model_executor.reinitialize_distributed(reconfig_request)
        if reconfig_request.new_data_parallel_size > old_dp_size:
            assert self.available_gpu_memory_for_kv_cache > 0
            # pass available_gpu_memory_for_kv_cache from existing
            # engine-cores to new engine-cores so they can directly
            # use it in _initialize_kv_caches() rather than profiling.
            ParallelConfig.sync_kv_cache_memory_size(
                self.dp_group, self.available_gpu_memory_for_kv_cache
            )
            # NOTE(yongji): newly joined workers require dummy_run even
            # CUDA graph is not used
            self.model_executor.collective_rpc("compile_or_warm_up_model")
        new_parallel_config = deepcopy(self.vllm_config.parallel_config)
        old_dp_size = new_parallel_config.data_parallel_size
        new_parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
        if (
            reconfig_request.new_data_parallel_rank
            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
            != ReconfigureRankType.KEEP_CURRENT_RANK
        ):
            self.shutdown()
            logger.info("DPEngineCoreProc %s shutdown", self.dp_rank)
        else:
            logger.info(
                "Distributed environment reinitialized for DP rank %s", self.dp_rank
            )
            new_parallel_config.data_parallel_rank = (
                reconfig_request.new_data_parallel_rank
            )
        new_parallel_config.data_parallel_master_ip = (
            reconfig_request.new_data_parallel_master_ip
        )
        new_parallel_config.data_parallel_master_port = (
            reconfig_request.new_data_parallel_master_port
        )
        new_parallel_config._data_parallel_master_port_list = (
            reconfig_request.new_data_parallel_master_port_list
        )

        is_scale_down = reconfig_request.new_data_parallel_size < old_dp_size
        is_shutdown = (
            reconfig_request.new_data_parallel_rank
            == ReconfigureRankType.SHUTDOWN_CURRENT_RANK
        )

        self.eep_scaling_state = ElasticEPScalingState(
            model_executor=self.model_executor,
            engine_core=self,
            vllm_config=self.vllm_config,
            new_parallel_config=new_parallel_config,
            worker_type="removing" if is_shutdown else "existing",
            scale_type="scale_down" if is_scale_down else "scale_up",
            reconfig_request=reconfig_request,
        )
        self.process_input_queue_block = False
        logger.info(
            "[Elastic EP] Received reconfiguration request and starting scaling up/down"
        )

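The removed lines in this hunk compared new_data_parallel_rank against the raw literals -1 and -2; the added lines use the ReconfigureRankType sentinels instead. A sketch of that mapping, with the values inferred from the literals being replaced:

    import enum

    class ReconfigureRankTypeSketch(enum.IntEnum):
        KEEP_CURRENT_RANK = -1       # rank unchanged; device visibility stays put
        SHUTDOWN_CURRENT_RANK = -2   # this engine-core is being scaled away

    new_rank = -2
    assert new_rank == ReconfigureRankTypeSketch.SHUTDOWN_CURRENT_RANK

Being an IntEnum, the sentinels still compare equal to the old integer literals, which keeps mixed old/new callers compatible.
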
    def _eep_send_engine_core_notification(
        self,
        notification_type: EEPNotificationType,
        vllm_config: VllmConfig | None = None,
    ):
        """
        Send notifications to EngineCoreClient, which can then forward
        the notifications to other engine core processes. It is used for:
        1) In scale up: new core engines notify existing core engines
           that they are ready;
        2) In scale down: removing core engines notify EngineCoreClient
           so EngineCoreClient can release their Ray placement groups;
        3) Both scale up/down: to notify EngineCoreClient that existing
           core engines have already switched to the new parallel setup.
        """
        if vllm_config is None:
            dp_rank = self.vllm_config.parallel_config.data_parallel_rank
        else:
            dp_rank = vllm_config.parallel_config.data_parallel_rank
        notification_data = (notification_type.value, dp_rank)
        outputs = EngineCoreOutputs(
            utility_output=UtilityOutput(
                call_id=EEP_NOTIFICATION_CALL_ID,
                result=UtilityResult(notification_data),
            )
        )
        outputs.engine_index = self.engine_index

        if hasattr(self, "output_thread") and self.output_thread.is_alive():
            self.output_queue.put_nowait((0, outputs))
        else:
            encoder = MsgpackEncoder()
            with (
                zmq.Context() as ctx,
                make_zmq_socket(
                    ctx, self.addresses.outputs[0], zmq.PUSH, linger=4000
                ) as socket,
            ):
                socket.send_multipart(encoder.encode(outputs))

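The sender above has two paths: the normal one enqueues onto the existing output thread, and the fallback (used before that thread exists, early in startup) pushes directly over a short-lived ZMQ PUSH socket with a 4-second linger so the message flushes before the context terminates. A minimal sketch of that fallback (address and payload are placeholders):

    import zmq

    def push_once(address: str, payload: bytes) -> None:
        with zmq.Context() as ctx, ctx.socket(zmq.PUSH) as sock:
            sock.setsockopt(zmq.LINGER, 4000)  # allow up to 4s to flush on close
            sock.connect(address)
            sock.send(payload)
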
    def eep_handle_engine_core_notification(
        self, notification_type: str | EEPNotificationType
    ):
        """
        Handle notification received from EngineCoreClient
        (forwarded from new core engines).
        """
        assert self.eep_scaling_state is not None
        if isinstance(notification_type, str):
            notification_type = EEPNotificationType(notification_type)
        self.eep_scaling_state.handle_notification(notification_type)

    def _eep_scale_up_before_kv_init(self):
        from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState

        self.eep_scaling_state = ElasticEPScalingState(
            model_executor=self.model_executor,
            engine_core=self,
            vllm_config=self.vllm_config,
            new_parallel_config=self.vllm_config.parallel_config,
            worker_type="new",
            scale_type="scale_up",
            reconfig_request=None,
        )
        self.model_executor.collective_rpc("init_device")
        self.model_executor.collective_rpc("load_model")
        self._eep_send_engine_core_notification(
            EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY
        )
        self.model_executor.collective_rpc(
            "elastic_ep_execute", args=("receive_weights",)
        )
        self.available_gpu_memory_for_kv_cache = (
            ParallelConfig.sync_kv_cache_memory_size(self.dp_group, -1)
        )
        self.model_executor.collective_rpc(
            "elastic_ep_execute", args=("prepare_new_worker",)
        )
        self.process_input_queue_block = False


class EngineCoreActorMixin: