Upgrade to vLLM 0.17.0 CoreX v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -17,6 +17,7 @@ from typing import Any, TypeVar, cast
import msgspec
import zmq
import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig
from vllm.distributed import stateless_destroy_torch_distributed_process_group
from vllm.envs import enable_envs_cache
@@ -44,6 +45,8 @@ from vllm.v1.core.kv_cache_utils import (
from vllm.v1.core.sched.interface import PauseState, SchedulerInterface
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.engine import (
EEP_NOTIFICATION_CALL_ID,
EEPNotificationType,
EngineCoreOutput,
EngineCoreOutputs,
EngineCoreRequest,
@@ -72,7 +75,6 @@ from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
POLLING_TIMEOUT_S = 2.5
HANDSHAKE_TIMEOUT_MINS = 5
_R = TypeVar("_R") # Return type for collective_rpc
@@ -111,6 +113,9 @@ class EngineCore:
self.available_gpu_memory_for_kv_cache = -1
if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
self._eep_scale_up_before_kv_init()
# Setup KV Caches and update CacheConfig after profiling.
num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
vllm_config
@@ -180,13 +185,55 @@ class EngineCore:
# Batch queue for scheduled batches. This enables us to asynchronously
# schedule and execute batches, and is required by pipeline parallelism
# to eliminate pipeline bubbles.
base_batch_queue_size = self.model_executor.max_concurrent_batches
if envs.VLLM_ENABLE_PP_ILU_OPT:
self.batch_queue_size = envs.VLLM_PP_ILU_OPT_BATCH_QUEUE_SIZE
if self.batch_queue_size <= 0:
self.batch_queue_size = base_batch_queue_size * 2
self._use_batch_queue_ilu_opt = True
logger.info(
"PP ILU opt is enabled: batch_queue_size=%d (base=%d)",
self.batch_queue_size,
base_batch_queue_size,
)
else:
self.batch_queue_size = base_batch_queue_size
self._use_batch_queue_ilu_opt = False
self.batch_queue: (
deque[tuple[Future[ModelRunnerOutput], SchedulerOutput, Future[Any]]] | None
) = None
if self.batch_queue_size > 1:
logger.debug("Batch queue is enabled with size %d", self.batch_queue_size)
logger.info(
"Batch queue is enabled with size %d (ilu_opt=%s)",
self.batch_queue_size,
self._use_batch_queue_ilu_opt,
)
self.batch_queue = deque(maxlen=self.batch_queue_size)
if self._use_batch_queue_ilu_opt:
self.engine_core_input_queue: queue.Queue[
tuple[Future[ModelRunnerOutput], SchedulerOutput]
] = queue.Queue(maxsize=self.batch_queue_size)
self.engine_core_output_queue: queue.Queue[
tuple[SchedulerOutput, ModelRunnerOutput, bool]
] = queue.Queue(maxsize=self.batch_queue_size)
self._batch_queue_loop_thread = threading.Thread(
target=self._process_batch_queue_loop,
daemon=True,
)
self._batch_queue_loop_thread.start()
# When PP mix ILU scheduling or PP ILU opt is enabled with a KV
# connector, only NixlConnector is supported.
if vllm_config.kv_transfer_config is not None and (
envs.VLLM_ENABLE_PP_MIX_ILU_SCHEDULING or envs.VLLM_ENABLE_PP_ILU_OPT
):
kv_connector_name = vllm_config.kv_transfer_config.kv_connector
if kv_connector_name != "NixlConnector":
raise ValueError(
"When VLLM_ENABLE_PP_MIX_ILU_SCHEDULING or VLLM_ENABLE_PP_ILU_OPT "
"is enabled with a KV connector, only NixlConnector is supported; "
f"current kv_connector is {kv_connector_name!r}."
)
self.is_ec_producer = (
vllm_config.ec_transfer_config is not None
@@ -209,6 +256,10 @@ class EngineCore:
self.step if self.batch_queue is None else self.step_with_batch_queue
)
self.async_scheduling = vllm_config.scheduler_config.async_scheduling
self.draft_in_model_output = (
self.batch_queue is not None and self.use_spec_decode
)
self.aborts_queue = queue.Queue[list[str]]()
@@ -234,12 +285,10 @@ class EngineCore:
has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
if has_kv_cache:
if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
# NOTE(yongji): should already be set
# during _eep_scale_up_before_kv_init
assert self.available_gpu_memory_for_kv_cache > 0
available_gpu_memory = [self.available_gpu_memory_for_kv_cache] * len(
kv_cache_specs
)
@@ -408,12 +457,52 @@ class EngineCore:
# When using async scheduling we can't get draft token ids in advance,
# so we update draft token ids in the worker process and don't
# need to update draft token ids here.
if self.draft_in_model_output:
return
if not self.async_scheduling and self.use_spec_decode and model_executed:
# Take the draft token ids.
draft_token_ids = self.model_executor.take_draft_token_ids()
if draft_token_ids is not None:
self.scheduler.update_draft_token_ids(draft_token_ids)
def _has_kv_connector_work(self, meta: Any) -> bool:
"""Return True if kv_connector_metadata has any recv/save/send work."""
if meta is None:
return False
for attr in ("reqs_to_recv", "reqs_to_save", "reqs_to_send"):
val = getattr(meta, attr, None)
if val is not None and len(val) > 0:
return True
return False
def _has_meaningful_scheduler_output(
self, scheduler_output: SchedulerOutput
) -> bool:
"""Return False if scheduler_output is effectively empty."""
return not (
len(scheduler_output.scheduled_new_reqs) == 0
and len(scheduler_output.scheduled_cached_reqs.req_ids) == 0
and len(scheduler_output.num_scheduled_tokens) == 0
and scheduler_output.total_num_scheduled_tokens == 0
and len(scheduler_output.scheduled_spec_decode_tokens) == 0
and len(scheduler_output.scheduled_encoder_inputs) == 0
and len(scheduler_output.finished_req_ids) == 0
and (scheduler_output.scheduled_resumed_reqs is None
or len(scheduler_output.scheduled_resumed_reqs) == 0)
and not self._has_kv_connector_work(
scheduler_output.kv_connector_metadata
)
)
def _process_batch_queue_loop(self) -> None:
while True:
future, scheduler_output = self.engine_core_input_queue.get()
with self.log_error_detail(scheduler_output):
model_output = future.result()
self.engine_core_output_queue.put(
(scheduler_output, model_output, False)
)
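# A minimal, self-contained sketch of the queue-and-thread pattern above,
# using only the standard library; names like fake_execute are illustrative
# and not part of vLLM. The submitting thread enqueues (future, metadata)
# pairs without waiting; only the daemon thread blocks on future.result().
import queue
import threading
import time
from concurrent.futures import Future, ThreadPoolExecutor

def sketch_pipeline(num_batches: int = 4) -> None:
    in_q: queue.Queue[tuple[Future[int], int]] = queue.Queue(maxsize=2)
    out_q: queue.Queue[tuple[int, int]] = queue.Queue()

    def loop() -> None:
        # Mirrors _process_batch_queue_loop: block on the oldest future,
        # then hand the finished batch to the output queue.
        while True:
            fut, batch_id = in_q.get()
            out_q.put((batch_id, fut.result()))

    threading.Thread(target=loop, daemon=True).start()

    def fake_execute(batch_id: int) -> int:
        time.sleep(0.01)  # stand-in for GPU compute
        return batch_id * 10

    done = 0
    with ThreadPoolExecutor(max_workers=1) as pool:
        for batch_id in range(num_batches):
            fut = pool.submit(fake_execute, batch_id)
            in_q.put((fut, batch_id))  # enqueue without waiting on compute
            while not out_q.empty():   # non-blocking poll, like get_nowait()
                bid, result = out_q.get_nowait()
                print("finished batch", bid, "->", result)
                done += 1
        while done < num_batches:      # drain the tail
            bid, result = out_q.get()
            print("finished batch", bid, "->", result)
            done += 1

sketch_pipeline()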
def step_with_batch_queue(
self,
) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
@@ -434,6 +523,9 @@ class EngineCore:
batch_queue = self.batch_queue
assert batch_queue is not None
if self._use_batch_queue_ilu_opt:
return self.step_with_batch_queue_ilu_opt()
# Try to schedule a new batch if the batch queue is not full, but
# the scheduler may return an empty batch if all requests are scheduled.
# Note that this is not blocking.
@@ -531,6 +623,96 @@ class EngineCore:
return engine_core_outputs, model_executed
def step_with_batch_queue_ilu_opt(
self,
) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
"""Async batch queue variant using background thread for PP ILU opt.
Uses engine_core_input_queue / engine_core_output_queue with a
background thread (_process_batch_queue_loop) that blocks on
future.result(), so the main thread never blocks on GPU compute.
"""
assert not self.is_ec_producer, (
"ec_producer is not supported in step_with_batch_queue_ilu_opt"
)
assert not self.is_pooling_model, (
"is_pooling_model is not supported in step_with_batch_queue_ilu_opt"
)
assert not self.async_scheduling, (
"async_scheduling is not supported in step_with_batch_queue_ilu_opt"
)
model_executed = False
if self.scheduler.has_requests():
scheduler_output = self.scheduler.schedule()
has_meaningful_schedule = self._has_meaningful_scheduler_output(
scheduler_output
)
if (
self.engine_core_input_queue.qsize() <= 1
and not has_meaningful_schedule
):
has_meaningful_schedule = True
if has_meaningful_schedule:
logger.debug(
"[step_with_batch_queue_ilu_opt] scheduler_output: "
"total_num_scheduled_tokens=%s num_scheduled_tokens=%s "
"scheduled_new_reqs=%s scheduled_cached_reqs.req_ids=%s "
"resumed_req_ids=%s finished_req_ids=%s "
"has_meaningful_schedule=%s",
scheduler_output.total_num_scheduled_tokens,
scheduler_output.num_scheduled_tokens,
[r.req_id for r in scheduler_output.scheduled_new_reqs],
scheduler_output.scheduled_cached_reqs.req_ids,
scheduler_output.scheduled_cached_reqs.resumed_req_ids,
scheduler_output.finished_req_ids,
has_meaningful_schedule,
)
if has_meaningful_schedule:
exec_future = self.model_executor.execute_model(
scheduler_output, non_block=True
)
model_executed = (
scheduler_output.total_num_scheduled_tokens > 0
)
if not model_executed:
future = cast(Future[ModelRunnerOutput], exec_future)
else:
grammar_output = self.scheduler.get_grammar_bitmask(
scheduler_output
)
future = self.model_executor.sample_tokens(
grammar_output, non_block=True
)
if self.engine_core_input_queue.full():
scheduler_output_out, model_output_out, model_executed_out = (
self.engine_core_output_queue.get()
)
engine_core_outputs = self.scheduler.update_from_output(
scheduler_output_out, model_output_out
)
self.engine_core_input_queue.put(
(future, scheduler_output)
)
return engine_core_outputs, model_executed_out
self.engine_core_input_queue.put((future, scheduler_output))
try:
scheduler_output, model_output, model_executed = (
self.engine_core_output_queue.get_nowait()
)
engine_core_outputs = self.scheduler.update_from_output(
scheduler_output, model_output
)
return engine_core_outputs, model_executed
except queue.Empty:
return None, False
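# A tiny sketch of the backpressure invariant in step_with_batch_queue_ilu_opt
# (illustrative only, standard library): with maxsize=N at most N batches are
# in flight, and a full input queue forces the main thread to retire the
# oldest finished batch (update_from_output) before enqueuing the new one.
import queue

def sketch_backpressure(num_steps: int = 5, n: int = 2) -> None:
    in_flight: queue.Queue[int] = queue.Queue(maxsize=n)
    for step in range(num_steps):
        if in_flight.full():
            print("retired batch", in_flight.get())  # oldest first
        in_flight.put(step)  # newly scheduled batch
    print("still in flight:", in_flight.qsize())  # always <= n

sketch_backpressure()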
def _process_aborts_queue(self):
if not self.aborts_queue.empty():
request_ids = []
@@ -753,11 +935,22 @@ class EngineCore:
self.structured_output_manager.grammar_init(req)
return req, request.current_wave
def _eep_scale_up_before_kv_init(self):
raise NotImplementedError
def _eep_send_engine_core_notification(
self,
notification_type: EEPNotificationType,
vllm_config: VllmConfig | None = None,
):
raise NotImplementedError
class EngineCoreProc(EngineCore):
"""ZMQ-wrapper for running EngineCore in background process."""
ENGINE_CORE_DEAD = b"ENGINE_CORE_DEAD"
addresses: EngineZmqAddresses
@instrument(span_name="EngineCoreProc init")
def __init__(
@@ -808,6 +1001,13 @@ class EngineCoreProc(EngineCore):
# and "hybrid" LB modes.
self.publish_dp_lb_stats = internal_dp_balancing
self.addresses = addresses
self.process_input_queue_block = True
if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
self._eep_send_engine_core_notification(
EEPNotificationType.NEW_CORE_ENGINES_INIT_READY,
vllm_config=vllm_config,
)
self._init_data_parallel(vllm_config)
super().__init__(
@@ -1120,8 +1320,14 @@ class EngineCoreProc(EngineCore):
if logger.isEnabledFor(DEBUG):
logger.debug("EngineCore waiting for work.")
waited = True
# process_input_queue_block is set to False while an elastic EP
# reconfiguration is in flight, so this loop degrades to a single
# non-blocking poll and the engine keeps stepping the scaling state
# machine between requests.
block = self.process_input_queue_block
try:
req = self.input_queue.get(block=block)
self._handle_client_request(*req)
except queue.Empty:
break
if not block:
break
if waited:
logger.debug("EngineCore loop active.")
@@ -1291,6 +1497,11 @@ class EngineCoreProc(EngineCore):
for input_socket, _ in poller.poll():
# (RequestType, RequestData)
type_frame, *data_frames = input_socket.recv_multipart(copy=False)
# NOTE(yongji): ignore the READY message sent by the DP coordinator,
# which is used to notify newly started engines
if type_frame.buffer == b"READY":
assert input_socket == coord_socket
continue
request_type = EngineCoreRequestType(bytes(type_frame.buffer))
# Deserialize the request data.
@@ -1489,6 +1700,10 @@ class DPEngineCoreProc(EngineCoreProc):
self.current_wave = 0
self.last_counts = (0, 0)
from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState
self.eep_scaling_state: ElasticEPScalingState | None = None
# Initialize the engine.
dp_rank = vllm_config.parallel_config.data_parallel_rank
super().__init__(
@@ -1512,7 +1727,9 @@ class DPEngineCoreProc(EngineCoreProc):
assert 0 <= local_dp_rank <= dp_rank < dp_size
self.dp_rank = dp_rank
self.dp_group, self.dp_store = (
vllm_config.parallel_config.stateless_init_dp_group(return_store=True)
)
def shutdown(self):
super().shutdown()
@@ -1533,7 +1750,11 @@ class DPEngineCoreProc(EngineCoreProc):
def resume_scheduler(self):
super().resume_scheduler()
if (
self.has_coordinator
and not self.engines_running
and self.scheduler.has_unfinished_requests()
):
# Wake up other DP engines.
self.output_queue.put_nowait(
(-1, EngineCoreOutputs(start_wave=self.current_wave))
@@ -1575,7 +1796,12 @@ class DPEngineCoreProc(EngineCoreProc):
# 1) Poll the input queue until there is work to do.
self._process_input_queue()
# 2) Step the engine core.
if self.eep_scaling_state is not None:
_ = self.eep_scaling_state.progress()
if self.eep_scaling_state.is_complete():
self.process_input_queue_block = True
self.eep_scaling_state = None
executed = self._process_engine_step()
self._maybe_publish_request_counts()
@@ -1625,54 +1851,129 @@ class DPEngineCoreProc(EngineCoreProc):
def reinitialize_distributed(
self, reconfig_request: ReconfigureDistributedRequest
) -> None:
from copy import deepcopy

from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState

new_parallel_config = deepcopy(self.vllm_config.parallel_config)
old_dp_size = new_parallel_config.data_parallel_size
new_parallel_config.data_parallel_size = reconfig_request.new_data_parallel_size
if (
reconfig_request.new_data_parallel_rank
!= ReconfigureRankType.KEEP_CURRENT_RANK
):
new_parallel_config.data_parallel_rank = (
reconfig_request.new_data_parallel_rank
)
new_parallel_config.data_parallel_master_ip = (
reconfig_request.new_data_parallel_master_ip
)
new_parallel_config.data_parallel_master_port = (
reconfig_request.new_data_parallel_master_port
)
new_parallel_config._data_parallel_master_port_list = (
reconfig_request.new_data_parallel_master_port_list
)
is_scale_down = reconfig_request.new_data_parallel_size < old_dp_size
is_shutdown = (
reconfig_request.new_data_parallel_rank
== ReconfigureRankType.SHUTDOWN_CURRENT_RANK
)
self.eep_scaling_state = ElasticEPScalingState(
model_executor=self.model_executor,
engine_core=self,
vllm_config=self.vllm_config,
new_parallel_config=new_parallel_config,
worker_type="removing" if is_shutdown else "existing",
scale_type="scale_down" if is_scale_down else "scale_up",
reconfig_request=reconfig_request,
)
self.process_input_queue_block = False
logger.info(
"[Elastic EP] Received reconfiguration request and starting scaling up/down"
)
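# A hedged sketch of the interface the engine loop relies on; the real
# ElasticEPScalingState lives in vllm.distributed.elastic_ep.elastic_state,
# and only progress()/is_complete()/handle_notification() are assumed here
# from the call sites. The phases are invented for illustration.
from enum import Enum, auto

class _Phase(Enum):
    TRANSFER_WEIGHTS = auto()  # e.g. move expert weights to/from peers
    REBUILD_GROUPS = auto()    # e.g. re-create communication groups
    DONE = auto()

class SketchScalingState:
    """Advances one phase per progress() call, so the engine loop stays
    responsive and can still poll its input queue between phases."""

    def __init__(self) -> None:
        self._phase = _Phase.TRANSFER_WEIGHTS

    def progress(self) -> "_Phase":
        if self._phase is _Phase.TRANSFER_WEIGHTS:
            self._phase = _Phase.REBUILD_GROUPS
        elif self._phase is _Phase.REBUILD_GROUPS:
            self._phase = _Phase.DONE
        return self._phase

    def is_complete(self) -> bool:
        return self._phase is _Phase.DONE

# Usage mirroring the busy loop: poll non-blockingly until scaling is done,
# then restore blocking input-queue reads.
state: SketchScalingState | None = SketchScalingState()
while state is not None:
    state.progress()
    if state.is_complete():
        state = None  # corresponds to process_input_queue_block = True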
def _eep_send_engine_core_notification(
self,
notification_type: EEPNotificationType,
vllm_config: VllmConfig | None = None,
):
"""
Send notifications to the EngineCoreClient, which can then forward
them to other engine core processes. This is used for:
1) Scale up: new core engines notify existing core engines
that they are ready;
2) Scale down: core engines being removed notify the EngineCoreClient
so it can release their Ray placement groups;
3) Both scale up and scale down: existing core engines notify the
EngineCoreClient that they have switched to the new parallel setup.
"""
if vllm_config is None:
dp_rank = self.vllm_config.parallel_config.data_parallel_rank
else:
dp_rank = vllm_config.parallel_config.data_parallel_rank
notification_data = (notification_type.value, dp_rank)
outputs = EngineCoreOutputs(
utility_output=UtilityOutput(
call_id=EEP_NOTIFICATION_CALL_ID,
result=UtilityResult(notification_data),
)
)
outputs.engine_index = self.engine_index
if hasattr(self, "output_thread") and self.output_thread.is_alive():
self.output_queue.put_nowait((0, outputs))
else:
encoder = MsgpackEncoder()
with (
zmq.Context() as ctx,
make_zmq_socket(
ctx, self.addresses.outputs[0], zmq.PUSH, linger=4000
) as socket,
):
socket.send_multipart(encoder.encode(outputs))
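# Minimal pyzmq sketch of the fallback transport above (the address and
# frames are placeholders): when the output thread is not running yet, the
# notification is pushed over a short-lived PUSH socket instead of the
# output queue, with a linger so the frame still flushes on close.
import zmq

def push_notification_once(addr: str, frames: list[bytes]) -> None:
    with zmq.Context() as ctx:
        sock = ctx.socket(zmq.PUSH)
        sock.linger = 4000  # ms to flush pending frames after close()
        sock.connect(addr)
        sock.send_multipart(frames)
        sock.close()

# Example (requires a PULL peer bound at the address):
# push_notification_once("tcp://127.0.0.1:5555",
#                        [b"eep", b"NEW_CORE_ENGINES_INIT_READY"])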
def eep_handle_engine_core_notification(
self, notification_type: str | EEPNotificationType
):
"""
Handle notification received from EngineCoreClient
(forwarded from new core engines).
"""
assert self.eep_scaling_state is not None
if isinstance(notification_type, str):
notification_type = EEPNotificationType(notification_type)
self.eep_scaling_state.handle_notification(notification_type)
def _eep_scale_up_before_kv_init(self):
from vllm.distributed.elastic_ep.elastic_state import ElasticEPScalingState
self.eep_scaling_state = ElasticEPScalingState(
model_executor=self.model_executor,
engine_core=self,
vllm_config=self.vllm_config,
new_parallel_config=self.vllm_config.parallel_config,
worker_type="new",
scale_type="scale_up",
reconfig_request=None,
)
self.model_executor.collective_rpc("init_device")
self.model_executor.collective_rpc("load_model")
self._eep_send_engine_core_notification(
EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY
)
self.model_executor.collective_rpc(
"elastic_ep_execute", args=("receive_weights",)
)
self.available_gpu_memory_for_kv_cache = (
ParallelConfig.sync_kv_cache_memory_size(self.dp_group, -1)
)
self.model_executor.collective_rpc(
"elastic_ep_execute", args=("prepare_new_worker",)
)
self.process_input_queue_block = False
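# Hedged recap of the new-engine scale-up handshake implemented above; the
# collective_rpc targets and notification name are taken from the code, the
# step ordering commentary is illustrative.
SCALE_UP_SEQUENCE = (
    "init_device",                          # 1. bind devices on new workers
    "load_model",                           # 2. construct the model
    "NEW_CORE_ENGINES_WEIGHTS_INIT_READY",  # 3. notify existing engines
    "receive_weights",                      # 4. pull weights from existing ranks
    "sync_kv_cache_memory_size",            # 5. adopt KV budget, skip profiling
    "prepare_new_worker",                   # 6. final setup, then unblock loop
)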
class EngineCoreActorMixin: