Upgrade to vLLM 0.17.0 CoreX v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -17,6 +17,7 @@ from typing import Any, TypeVar, cast
import msgspec
import zmq
import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig
from vllm.distributed import stateless_destroy_torch_distributed_process_group
from vllm.envs import enable_envs_cache
@@ -44,6 +45,8 @@ from vllm.v1.core.kv_cache_utils import (
from vllm.v1.core.sched.interface import PauseState, SchedulerInterface
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.engine import (
EEP_NOTIFICATION_CALL_ID,
EEPNotificationType,
EngineCoreOutput,
EngineCoreOutputs,
EngineCoreRequest,
@@ -72,7 +75,6 @@ from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
POLLING_TIMEOUT_S = 2.5
HANDSHAKE_TIMEOUT_MINS = 5
_R = TypeVar("_R") # Return type for collective_rpc
@@ -111,6 +113,9 @@ class EngineCore:
self.available_gpu_memory_for_kv_cache = -1
if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
self._eep_scale_up_before_kv_init()
# Setup KV Caches and update CacheConfig after profiling.
num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
vllm_config
@@ -180,13 +185,55 @@ class EngineCore:
# Batch queue for scheduled batches. This enables us to asynchronously
# schedule and execute batches, and is required by pipeline parallelism
# to eliminate pipeline bubbles.
base_batch_queue_size = self.model_executor.max_concurrent_batches
if envs.VLLM_ENABLE_PP_ILU_OPT:
self.batch_queue_size = envs.VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE
if self.batch_queue_size <= 0:
self.batch_queue_size = base_batch_queue_size * 2
self._use_batch_queue_ilu_opt = True
logger.info(
"PP ILU opt is enabled: batch_queue_size=%d (base=%d)",
self.batch_queue_size,
base_batch_queue_size,
)
else:
self.batch_queue_size = base_batch_queue_size
self._use_batch_queue_ilu_opt = False
self.batch_queue: (
deque[tuple[Future[ModelRunnerOutput], SchedulerOutput, Future[Any]]] | None
) = None
if self.batch_queue_size > 1:
logger.debug("Batch queue is enabled with size %d", self.batch_queue_size)
logger.info(
"Batch queue is enabled with size %d (ilu_opt=%s)",
self.batch_queue_size,
self._use_batch_queue_ilu_opt,
)
self.batch_queue = deque(maxlen=self.batch_queue_size)
if self._use_batch_queue_ilu_opt:
self.engine_core_input_queue: queue.Queue[
tuple[Future[ModelRunnerOutput], SchedulerOutput]
] = queue.Queue(maxsize=self.batch_queue_size)
self.engine_core_output_queue: queue.Queue[
tuple[SchedulerOutput, ModelRunnerOutput, bool]
] = queue.Queue(maxsize=self.batch_queue_size)
self._batch_queue_loop_thread = threading.Thread(
target=self._process_batch_queue_loop,
daemon=True,
)
self._batch_queue_loop_thread.start()
# When PP mix ILU scheduling or PP ILU opt is enabled with a KV
# connector, only NixlConnector is supported.
if vllm_config.kv_transfer_config is not None and (
envs.VLLM_ENABLE_PP_MIX_ILU_SCHEDULING or envs.VLLM_ENABLE_PP_ILU_OPT
):
kv_connector_name = vllm_config.kv_transfer_config.kv_connector
if kv_connector_name != "NixlConnector":
raise ValueError(
"When VLLM_ENABLE_PP_MIX_ILU_SCHEDULING or VLLM_ENABLE_PP_ILU_OPT "
"is enabled with a KV connector, only NixlConnector is supported; "
f"current kv_connector is {kv_connector_name!r}."
)
self.is_ec_producer = (
vllm_config.ec_transfer_config is not None
@@ -209,6 +256,10 @@ class EngineCore:
self.step if self.batch_queue is None else self.step_with_batch_queue
)
self.async_scheduling = vllm_config.scheduler_config.async_scheduling
self.draft_in_model_output = (
self.batch_queue is not None and self.use_spec_decode
)
self.aborts_queue = queue.Queue[list[str]]()
@@ -234,12 +285,10 @@ class EngineCore:
has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
if has_kv_cache:
if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
# NOTE(yongji): should already be set
# during _eep_scale_up_before_kv_init
assert self.available_gpu_memory_for_kv_cache > 0
available_gpu_memory = [self.available_gpu_memory_for_kv_cache] * len(
kv_cache_specs
)
@@ -408,12 +457,52 @@ class EngineCore:
# When using async scheduling we can't get draft token ids in advance,
# so we update draft token ids in the worker process and don't
# need to update draft token ids here.
if self.draft_in_model_output:
return
if not self.async_scheduling and self.use_spec_decode and model_executed:
# Take the draft token ids.
draft_token_ids = self.model_executor.take_draft_token_ids()
if draft_token_ids is not None:
self.scheduler.update_draft_token_ids(draft_token_ids)
def _has_kv_connector_work(self, meta: Any) -> bool:
"""Return True if kv_connector_metadata has any recv/save/send work."""
if meta is None:
return False
for attr in ("reqs_to_recv", "reqs_to_save", "reqs_to_send"):
val = getattr(meta, attr, None)
if val is not None and len(val) > 0:
return True
return False
def _has_meaningful_scheduler_output(
self, scheduler_output: SchedulerOutput
) -> bool:
"""Return False if scheduler_output is effectively empty."""
return not (
len(scheduler_output.scheduled_new_reqs) == 0
and len(scheduler_output.scheduled_cached_reqs.req_ids) == 0
and len(scheduler_output.num_scheduled_tokens) == 0
and scheduler_output.total_num_scheduled_tokens == 0
and len(scheduler_output.scheduled_spec_decode_tokens) == 0
and len(scheduler_output.scheduled_encoder_inputs) == 0
and len(scheduler_output.finished_req_ids) == 0
and (scheduler_output.scheduled_resumed_reqs is None
or len(scheduler_output.scheduled_resumed_reqs) == 0)
and not self._has_kv_connector_work(
scheduler_output.kv_connector_metadata
)
)
def _process_batch_queue_loop(self) -> None:
while True:
future, scheduler_output = self.engine_core_input_queue.get()
with self.log_error_detail(scheduler_output):
model_output = future.result()
self.engine_core_output_queue.put(
(scheduler_output, model_output, False)
)
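# A minimal, self-contained sketch of the queue-and-thread pattern above,
# using only the standard library; names like fake_execute are illustrative
# and not part of vLLM. The submitting thread enqueues (future, metadata)
# pairs without waiting; only the daemon thread blocks on future.result().
import queue
import threading
import time
from concurrent.futures import Future, ThreadPoolExecutor

def sketch_pipeline(num_batches: int = 4) -> None:
    in_q: queue.Queue[tuple[Future[int], int]] = queue.Queue(maxsize=2)
    out_q: queue.Queue[tuple[int, int]] = queue.Queue()

    def loop() -> None:
        # Mirrors _process_batch_queue_loop: block on the oldest future,
        # then hand the finished batch to the output queue.
        while True:
            fut, batch_id = in_q.get()
            out_q.put((batch_id, fut.result()))

    threading.Thread(target=loop, daemon=True).start()

    def fake_execute(batch_id: int) -> int:
        time.sleep(0.01)  # stand-in for GPU compute
        return batch_id * 10

    done = 0
    with ThreadPoolExecutor(max_workers=1) as pool:
        for batch_id in range(num_batches):
            fut = pool.submit(fake_execute, batch_id)
            in_q.put((fut, batch_id))  # enqueue without waiting on compute
            while not out_q.empty():   # non-blocking poll, like get_nowait()
                bid, result = out_q.get_nowait()
                print("finished batch", bid, "->", result)
                done += 1
        while done < num_batches:      # drain the tail
            bid, result = out_q.get()
            print("finished batch", bid, "->", result)
            done += 1

sketch_pipeline()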
def step_with_batch_queue(
self,
) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
@@ -434,6 +523,9 @@ class EngineCore:
batch_queue = self.batch_queue
assert batch_queue is not None
if self._use_batch_queue_ilu_opt:
return self.step_with_batch_queue_ilu_opt()
# Try to schedule a new batch if the batch queue is not full, but
# the scheduler may return an empty batch if all requests are scheduled.
# Note that this is not blocking.
@@ -531,6 +623,96 @@ class EngineCore:
return engine_core_outputs, model_executed
def step_with_batch_queue_ilu_opt(
self,
) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
"""Async batch queue variant using background thread for PP ILU opt.
Uses engine_core_input_queue / engine_core_output_queue with a
background thread (_process_batch_queue_loop) that blocks on
future.result(), so the main thread never blocks on GPU compute.
"""
assert not self.is_ec_producer, (
"ec_producer is not supported in step_with_batch_queue_ilu_opt"
)
assert not self.is_pooling_model, (
"is_pooling_model is not supported in step_with_batch_queue_ilu_opt"
)
assert not self.async_scheduling, (
"async_scheduling is not supported in step_with_batch_queue_ilu_opt"
)
model_executed = False
if self.scheduler.has_requests():
scheduler_output = self.scheduler.schedule()
has_meaningful_schedule = self._has_meaningful_scheduler_output(
scheduler_output
)
if (
self.engine_core_input_queue.qsize() <= 1
and not has_meaningful_schedule
):
has_meaningful_schedule = True
if has_meaningful_schedule:
logger.debug(
"[step_with_batch_queue_ilu_opt] scheduler_output: "
"total_num_scheduled_tokens=%s num_scheduled_tokens=%s "
"scheduled_new_reqs=%s scheduled_cached_reqs.req_ids=%s "
"resumed_req_ids=%s finished_req_ids=%s "
"has_meaningful_schedule=%s",
scheduler_output.total_num_scheduled_tokens,
scheduler_output.num_scheduled_tokens,
[r.req_id for r in scheduler_output.scheduled_new_reqs],
scheduler_output.scheduled_cached_reqs.req_ids,
scheduler_output.scheduled_cached_reqs.resumed_req_ids,
scheduler_output.finished_req_ids,
has_meaningful_schedule,
)
if has_meaningful_schedule:
exec_future = self.model_executor.execute_model(
scheduler_output, non_block=True
)
model_executed = (
scheduler_output.total_num_scheduled_tokens > 0
)
if not model_executed:
future = cast(Future[ModelRunnerOutput], exec_future)
else:
grammar_output = self.scheduler.get_grammar_bitmask(
scheduler_output
)
future = self.model_executor.sample_tokens(
grammar_output, non_block=True
)
if self.engine_core_input_queue.full():
scheduler_output_out, model_output_out, model_executed_out = (
self.engine_core_output_queue.get()
)
engine_core_outputs = self.scheduler.update_from_output(
scheduler_output_out, model_output_out
)
self.engine_core_input_queue.put(
(future, scheduler_output)
)
return engine_core_outputs, model_executed_out
self.engine_core_input_queue.put((future, scheduler_output))
try:
scheduler_output, model_output, model_executed = (
self.engine_core_output_queue.get_nowait()
)
engine_core_outputs = self.scheduler.update_from_output(
scheduler_output, model_output
)
return engine_core_outputs, model_executed
except queue.Empty:
return None, False
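# A tiny sketch of the backpressure invariant in step_with_batch_queue_ilu_opt
# (illustrative only, standard library): with maxsize=N at most N batches are
# in flight, and a full input queue forces the main thread to retire the
# oldest finished batch (update_from_output) before enqueuing the new one.
import queue

def sketch_backpressure(num_steps: int = 5, n: int = 2) -> None:
    in_flight: queue.Queue[int] = queue.Queue(maxsize=n)
    for step in range(num_steps):
        if in_flight.full():
            print("retired batch", in_flight.get())  # oldest first
        in_flight.put(step)  # newly scheduled batch
    print("still in flight:", in_flight.qsize())  # always <= n

sketch_backpressure()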
def _process_aborts_queue(self):
if not self.aborts_queue.empty():
request_ids = []
@@ -753,11 +935,22 @@ class EngineCore:
self.structured_output_manager.grammar_init(req)
return req, request.current_wave
def _eep_scale_up_before_kv_init(self):
raise NotImplementedError
def _eep_send_engine_core_notification(
self,
notification_type: EEPNotificationType,
vllm_config: VllmConfig | None = None,
):
raise NotImplementedError
class EngineCoreProc(EngineCore):
"""ZMQ-wrapper for running EngineCore in background process."""
ENGINE_CORE_DEAD = b"ENGINE_CORE_DEAD"
addresses: EngineZmqAddresses
@instrument(span_name="EngineCoreProc init")
def __init__(
@@ -808,6 +1001,13 @@ class EngineCoreProc(EngineCore):
# and "hybrid" LB modes.
self.publish_dp_lb_stats = internal_dp_balancing
self.addresses = addresses
self.process_input_queue_block = True
if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
self._eep_send_engine_core_notification(
EEPNotificationType.NEW_CORE_ENGINES_INIT_READY,
vllm_config=vllm_config,
)
self._init_data_parallel(vllm_config)
super().__init__(
@@ -1120,8 +1320,14 @@ class EngineCoreProc(EngineCore):
if logger.isEnabledFor(DEBUG):
logger.debug("EngineCore waiting for work.")
waited = True
# process_input_queue_block is set to False while an elastic EP
# reconfiguration is in flight, so this loop degrades to a single
# non-blocking poll and the engine keeps stepping the scaling state
# machine between requests.
block = self.process_input_queue_block
try:
req = self.input_queue.get(block=block)
self._handle_client_request(*req)
except queue.Empty:
break
if not block:
break
if waited:
logger.debug("EngineCore loop active.")
@@ -1291,6 +1497,11 @@ class EngineCoreProc(EngineCore):
for input_socket, _ in poller.poll():
# (RequestType, RequestData)
type_frame, *data_frames = input_socket.recv_multipart(copy=False)
# NOTE(yongji): ignore the READY message sent by the DP coordinator,
# which is used to notify newly started engines
if type_frame.buffer == b"READY":
assert input_socket == coord_socket
continue
request_type = EngineCoreRequestType(bytes(type_frame.buffer))
# Deserialize the request data.
@@ -1489,6 +1700,10 @@ class DPEngineCoreProc(EngineCoreProc):
self.current_wave = 0
self.last_counts = (0, 0)
from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState
self.eep_scaling_state: ElasticEPScalingState | None = None
# Initialize the engine.
dp_rank = vllm_config.parallel_config.data_parallel_rank
super().__init__(
@@ -1512,7 +1727,9 @@ class DPEngineCoreProc(EngineCoreProc):
assert 0 <= local_dp_rank <= dp_rank < dp_size
self.dp_rank = dp_rank
self.dp_group, self.dp_store = (
vllm_config.parallel_config.stateless_init_dp_group(return_store=True)
)
def shutdown(self):
super().shutdown()
@@ -1533,7 +1750,11 @@ class DPEngineCoreProc(EngineCoreProc):
def resume_scheduler(self):
super().resume_scheduler()
if (
self.has_coordinator
and not self.engines_running
and self.scheduler.has_unfinished_requests()
):
# Wake up other DP engines.
self.output_queue.put_nowait(
(-1, EngineCoreOutputs(start_wave=self.current_wave))
@@ -1575,7 +1796,12 @@ class DPEngineCoreProc(EngineCoreProc):
# 1) Poll the input queue until there is work to do.
self._process_input_queue()
# 2) Step the engine core.
if self.eep_scaling_state is not None:
_ = self.eep_scaling_state.progress()
if self.eep_scaling_state.is_complete():
self.process_input_queue_block = True
self.eep_scaling_state = None
executed = self._process_engine_step()
self._maybe_publish_request_counts()
@@ -1625,54 +1851,129 @@ class DPEngineCoreProc(EngineCoreProc):
def reinitialize_distributed(
self, reconfig_request: ReconfigureDistributedRequest
) -> None:
from copy import deepcopy

from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState

new_parallel_config = deepcopy(self.vllm_config.parallel_config)
old_dp_size = new_parallel_config.data_parallel_size
new_parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
if (
reconfig_request.new_data_parallel_rank
!= ReconfigureRankType.KEEP_CURRENT_RANK
):
new_parallel_config.data_parallel_rank = (
reconfig_request.new_data_parallel_rank
)
new_parallel_config.data_parallel_master_ip = (
reconfig_request.new_data_parallel_master_ip
)
new_parallel_config.data_parallel_master_port = (
reconfig_request.new_data_parallel_master_port
)
new_parallel_config._data_parallel_master_port_list = (
reconfig_request.new_data_parallel_master_port_list
)
is_scale_down = reconfig_request.new_data_parallel_size < old_dp_size
is_shutdown = (
reconfig_request.new_data_parallel_rank
== ReconfigureRankType.SHUTDOWN_CURRENT_RANK
)
self.eep_scaling_state = ElasticEPScalingState(
model_executor=self.model_executor,
engine_core=self,
vllm_config=self.vllm_config,
new_parallel_config=new_parallel_config,
worker_type="removing" if is_shutdown else "existing",
scale_type="scale_down" if is_scale_down else "scale_up",
reconfig_request=reconfig_request,
)
self.process_input_queue_block = False
logger.info(
"[Elastic EP] Received reconfiguration request and starting scaling up/down"
)
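# A hedged sketch of the interface the engine loop relies on; the real
# ElasticEPScalingState lives in vllm.distributed.elastic_ep.elastic_state,
# and only progress()/is_complete()/handle_notification() are assumed here
# from the call sites. The phases are invented for illustration.
from enum import Enum, auto

class _Phase(Enum):
    TRANSFER_WEIGHTS = auto()  # e.g. move expert weights to/from peers
    REBUILD_GROUPS = auto()    # e.g. re-create communication groups
    DONE = auto()

class SketchScalingState:
    """Advances one phase per progress() call, so the engine loop stays
    responsive and can still poll its input queue between phases."""

    def __init__(self) -> None:
        self._phase = _Phase.TRANSFER_WEIGHTS

    def progress(self) -> "_Phase":
        if self._phase is _Phase.TRANSFER_WEIGHTS:
            self._phase = _Phase.REBUILD_GROUPS
        elif self._phase is _Phase.REBUILD_GROUPS:
            self._phase = _Phase.DONE
        return self._phase

    def is_complete(self) -> bool:
        return self._phase is _Phase.DONE

# Usage mirroring the busy loop: poll non-blockingly until scaling is done,
# then restore blocking input-queue reads.
state: SketchScalingState | None = SketchScalingState()
while state is not None:
    state.progress()
    if state.is_complete():
        state = None  # corresponds to process_input_queue_block = True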
def _eep_send_engine_core_notification(
self,
notification_type: EEPNotificationType,
vllm_config: VllmConfig | None = None,
):
"""
Send notifications to the EngineCoreClient, which can then forward
them to other engine core processes. This is used for:
1) Scale up: new core engines notify existing core engines
that they are ready;
2) Scale down: core engines being removed notify the EngineCoreClient
so it can release their Ray placement groups;
3) Both scale up and scale down: existing core engines notify the
EngineCoreClient that they have switched to the new parallel setup.
"""
if vllm_config is None:
dp_rank = self.vllm_config.parallel_config.data_parallel_rank
else:
dp_rank = vllm_config.parallel_config.data_parallel_rank
notification_data = (notification_type.value, dp_rank)
outputs = EngineCoreOutputs(
utility_output=UtilityOutput(
call_id=EEP_NOTIFICATION_CALL_ID,
result=UtilityResult(notification_data),
)
)
outputs.engine_index = self.engine_index
if hasattr(self, "output_thread") and self.output_thread.is_alive():
self.output_queue.put_nowait((0, outputs))
else:
encoder = MsgpackEncoder()
with (
zmq.Context() as ctx,
make_zmq_socket(
ctx, self.addresses.outputs[0], zmq.PUSH, linger=4000
) as socket,
):
socket.send_multipart(encoder.encode(outputs))
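# Minimal pyzmq sketch of the fallback transport above (the address and
# frames are placeholders): when the output thread is not running yet, the
# notification is pushed over a short-lived PUSH socket instead of the
# output queue, with a linger so the frame still flushes on close.
import zmq

def push_notification_once(addr: str, frames: list[bytes]) -> None:
    with zmq.Context() as ctx:
        sock = ctx.socket(zmq.PUSH)
        sock.linger = 4000  # ms to flush pending frames after close()
        sock.connect(addr)
        sock.send_multipart(frames)
        sock.close()

# Example (requires a PULL peer bound at the address):
# push_notification_once("tcp://127.0.0.1:5555",
#                        [b"eep", b"NEW_CORE_ENGINES_INIT_READY"])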
def eep_handle_engine_core_notification(
self, notification_type: str | EEPNotificationType
):
"""
Handle notification received from EngineCoreClient
(forwarded from new core engines).
"""
assert self.eep_scaling_state is not None
if isinstance(notification_type, str):
notification_type = EEPNotificationType(notification_type)
self.eep_scaling_state.handle_notification(notification_type)
def _eep_scale_up_before_kv_init(self):
from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState
self.eep_scaling_state = ElasticEPScalingState(
model_executor=self.model_executor,
engine_core=self,
vllm_config=self.vllm_config,
new_parallel_config=self.vllm_config.parallel_config,
worker_type="new",
scale_type="scale_up",
reconfig_request=None,
)
self.model_executor.collective_rpc("init_device")
self.model_executor.collective_rpc("load_model")
self._eep_send_engine_core_notification(
EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY
)
self.model_executor.collective_rpc(
"elastic_ep_execute", args=("receive_weights",)
)
self.available_gpu_memory_for_kv_cache = (
ParallelConfig.sync_kv_cache_memory_size(self.dp_group, -1)
)
self.model_executor.collective_rpc(
"elastic_ep_execute", args=("prepare_new_worker",)
)
self.process_input_queue_block = False
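# Hedged recap of the new-engine scale-up handshake implemented above; the
# collective_rpc targets and notification name are taken from the code, the
# step ordering commentary is illustrative.
SCALE_UP_SEQUENCE = (
    "init_device",                          # 1. bind devices on new workers
    "load_model",                           # 2. construct the model
    "NEW_CORE_ENGINES_WEIGHTS_INIT_READY",  # 3. notify existing engines
    "receive_weights",                      # 4. pull weights from existing ranks
    "sync_kv_cache_memory_size",            # 5. adopt KV budget, skip profiling
    "prepare_new_worker",                   # 6. final setup, then unblock loop
)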
class EngineCoreActorMixin: