upgrade to vllm 0.11.2 (#4400)

Bump vLLM version to v0.11.2.

What's broken and changed by vLLM:
1. structured_output is broken by https://github.com/vllm-project/vllm/pull/26866
2. get_mrope_input_positions is broken by https://github.com/vllm-project/vllm/pull/28399
3. graph mode is broken by https://github.com/vllm-project/vllm/pull/25110; we'll upgrade torch to 2.8 later to fix the problem
4. embedding is broken by https://github.com/vllm-project/vllm/pull/27583
5. `get_attn_backend_cls` and the attention backend are broken by https://github.com/vllm-project/vllm/pull/28534
6. spec decode is broken by https://github.com/vllm-project/vllm/pull/28771
7. the sp feature is broken by https://github.com/vllm-project/vllm/pull/27126
8. mtp is broken by https://github.com/vllm-project/vllm/pull/27922
9. lora is broken by https://github.com/vllm-project/vllm/pull/21068
10. execute_model is broken by https://github.com/vllm-project/vllm/pull/26866
11. the `VLLM_DISABLE_SHARED_EXPERTS_STREAM` env is broken by https://github.com/vllm-project/vllm/pull/28159
12. kv cache is broken by https://github.com/vllm-project/vllm/pull/27753
13. dp is broken by https://github.com/vllm-project/vllm/pull/25110

What's broken and changed by ourselves:
1. qwen vl is broken by https://github.com/vllm-project/vllm/pull/28455; we'll remove the model files in the future to avoid this kind of error
2. engine core is broken by https://github.com/vllm-project/vllm/pull/23691; we'll remove the patch file in the future
3. the Ascend scheduler is broken by https://github.com/vllm-project/vllm/pull/28733; we'll remove the Ascend scheduler later
4. qwen3-next is broken by https://github.com/vllm-project/vllm/pull/28083; we'll remove the model files in the future to avoid this kind of error
5. qwen vl is broken by https://github.com/vllm-project/vllm/pull/27764; we'll remove the model files in the future

Known issues:
1. ray doesn't work
2. the accuracy of qwen3-next is not correct
3. qwen3-vl is broken
4. prefix cache + Ascend scheduler + deepseek v2 lite is broken

- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: 22dimensions <waitingwind@foxmail.com>
Co-authored-by: shen-shanshan <467638484@qq.com>
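Most of the breakage listed above comes from upstream symbols moving between modules rather than from behavior changes. A minimal sketch of the version-gated import pattern that keeps a plugin working across such a move (the new path is taken from the diff below; the fallback branch is an assumption about the pre-0.11.2 layout):

```python
# Hedged sketch: guard an import that vLLM relocated around v0.11.2.
try:
    # Layout used by vLLM v0.11.2, as seen in the diff below.
    from vllm.utils.network_utils import get_distributed_init_method
except ImportError:
    # Assumed fallback for older vLLM layouts.
    from vllm.utils import get_distributed_init_method
```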
@@ -18,6 +18,7 @@ import os
 import vllm_ascend.patch.platform.patch_config  # noqa
 import vllm_ascend.patch.platform.patch_distributed  # noqa
+import vllm_ascend.patch.platform.patch_dynamo_vllm_backend  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
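These `patch_*` modules take effect purely as an import side effect, which is why each import carries `# noqa` even though the imported name is never used. Registering a new platform patch is just one more such line; a sketch with a hypothetical module name:

```python
# Hypothetical: registering one more import-time patch module.
# The module is imported only for its side effect (it monkey-patches
# vLLM on load), so the unused-import warning is silenced with noqa.
import vllm_ascend.patch.platform.patch_my_feature  # noqa
```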
vllm_ascend/patch/platform/patch_dynamo_vllm_backend.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+# mypy: ignore-errors
+from typing import Any, Dict
+
+import torch.fx as fx
+from vllm.compilation.backends import VllmBackend
+from vllm.compilation.caching import VllmSerializableFunction
+
+_original_vllmbackend_call = VllmBackend.__call__
+
+
+def __patch_call__(self, graph: fx.GraphModule, example_inputs,
+                   options: Dict[str, Any]) -> VllmSerializableFunction:
+    return _original_vllmbackend_call(self, graph, example_inputs)
+
+
+VllmBackend.__call__ = __patch_call__
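The new file is a signature-adaptation shim: callers now pass a third `options` argument to `VllmBackend.__call__`, so the patch captures the original method, accepts `options`, and drops it when delegating with the old two-argument arity. A self-contained sketch of the same adapt-and-delegate pattern (all names here are hypothetical stand-ins, not vLLM APIs):

```python
from typing import Any, Dict


class Backend:
    """Hypothetical stand-in for a class whose __call__ gained an argument."""

    def __call__(self, graph, example_inputs):
        return (graph, example_inputs)


# Capture the original before rebinding, exactly as the patch above does.
_original_call = Backend.__call__


def _patched_call(self, graph, example_inputs, options: Dict[str, Any]):
    # Accept the new third argument but delegate with the old arity.
    return _original_call(self, graph, example_inputs)


Backend.__call__ = _patched_call

# Three-argument calls now succeed; options is intentionally ignored.
assert Backend()("g", ["x"], {"opt": 1}) == ("g", ["x"])
```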
@@ -1,24 +1,24 @@
 import threading
 import weakref
-from concurrent.futures import ThreadPoolExecutor
+from collections import deque
+from collections.abc import Callable
 from multiprocessing.synchronize import Lock as LockType
-from typing import Optional

 import vllm.v1.executor.multiproc_executor
 from vllm import envs
 from vllm.config import VllmConfig
-from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
+from vllm.distributed.device_communicators.shm_broadcast import (Handle,
+                                                                 MessageQueue)
 from vllm.utils.network_utils import (get_distributed_init_method,
                                       get_loopback_ip, get_open_port)
 from vllm.utils.system_utils import get_mp_context
 from vllm.v1.executor.abstract import FailureCallback
 from vllm.v1.executor.multiproc_executor import (
-    MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
+    FutureWrapper, MultiprocExecutor, UnreadyWorkerProcHandle, WorkerProc,
     set_multiprocessing_worker_envs)


 class AscendMultiprocExecutor(MultiprocExecutor):
     supports_pp: bool = True

     def _init_executor(self) -> None:
         # Call self.shutdown at exit to clean up
@@ -26,10 +26,14 @@ class AscendMultiprocExecutor(MultiprocExecutor):
         self._finalizer = weakref.finalize(self, self.shutdown)
         self.is_failed = False
         self.shutdown_event = threading.Event()
-        self.failure_callback: Optional[FailureCallback] = None
-        self.io_thread_pool: Optional[ThreadPoolExecutor] = None
+        self.failure_callback: FailureCallback | None = None

         self.world_size = self.parallel_config.world_size
+        assert self.world_size % self.parallel_config.nnodes_within_dp == 0, (
+            f"global world_size ({self.parallel_config.world_size}) must be "
+            f"divisible by nnodes_within_dp "
+            f"({self.parallel_config.nnodes_within_dp}). ")
+        self.local_world_size = self.parallel_config.local_world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
         pp_parallel_size = self.parallel_config.pipeline_parallel_size
         assert self.world_size == tensor_parallel_size * pp_parallel_size, (
@@ -45,27 +49,36 @@ class AscendMultiprocExecutor(MultiprocExecutor):
         # get_loopback_ip() for communication.
         distributed_init_method = get_distributed_init_method(
             get_loopback_ip(), get_open_port())

+        self.rpc_broadcast_mq: MessageQueue | None = None
+        scheduler_output_handle: Handle | None = None
         # Initialize worker and set up message queues for SchedulerOutputs
         # and ModelRunnerOutputs
-        max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
-        self.rpc_broadcast_mq = MessageQueue(self.world_size,
-                                             self.world_size,
-                                             max_chunk_bytes=max_chunk_bytes)
-        scheduler_output_handle = self.rpc_broadcast_mq.export_handle()
+        if self.parallel_config.node_rank_within_dp == 0:
+            # For leader node within each dp rank,
+            # each dp will have its own leader multiproc executor.
+            max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
+            self.rpc_broadcast_mq = MessageQueue(
+                self.world_size,
+                self.local_world_size,
+                max_chunk_bytes=max_chunk_bytes,
+                connect_ip=self.parallel_config.master_addr,
+            )
+            scheduler_output_handle = self.rpc_broadcast_mq.export_handle()

         # Create workers
         context = get_mp_context()
         shared_worker_lock = context.Lock()
         unready_workers: list[UnreadyWorkerProcHandle] = []
         success = False
         try:
-            for rank in range(self.world_size):
+            global_start_rank = (self.local_world_size *
+                                 self.parallel_config.node_rank_within_dp)
+            for local_rank in range(self.local_world_size):
+                global_rank = global_start_rank + local_rank
                 unready_workers.append(
                     AscendWorkerProc.make_worker_process(
                         vllm_config=self.vllm_config,
-                        local_rank=rank,
-                        rank=rank,
+                        local_rank=local_rank,
+                        rank=global_rank,
                         distributed_init_method=distributed_init_method,
                         input_shm_handle=scheduler_output_handle,
                         shared_worker_lock=shared_worker_lock,
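The rank bookkeeping in this hunk is the heart of the multi-node change: each node now spawns only `local_world_size` workers and derives each worker's global rank from the node's position within the dp group. A small worked example of the mapping (values illustrative):

```python
# Illustrative rank layout: world_size=8 split across nnodes_within_dp=2,
# mirroring the assert and the global_start_rank arithmetic above.
world_size = 8
nnodes_within_dp = 2
assert world_size % nnodes_within_dp == 0
local_world_size = world_size // nnodes_within_dp  # 4 workers per node

for node_rank_within_dp in range(nnodes_within_dp):
    global_start_rank = local_world_size * node_rank_within_dp
    ranks = [global_start_rank + local_rank
             for local_rank in range(local_world_size)]
    print(f"node {node_rank_within_dp}: global ranks {ranks}")
# node 0: global ranks [0, 1, 2, 3]
# node 1: global ranks [4, 5, 6, 7]
```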
@@ -73,15 +86,38 @@ class AscendMultiprocExecutor(MultiprocExecutor):

             # Workers must be created before wait_for_ready to avoid
             # deadlock, since worker.init_device() does a device sync.
-            self.workers = WorkerProc.wait_for_ready(unready_workers)
+            # Wait for all local workers to be ready.
+            self.workers = AscendWorkerProc.wait_for_ready(unready_workers)

+            # Start background thread to monitor worker health if not in headless mode.
+            if self.monitor_workers:
+                self.start_worker_monitor()

+            self.response_mqs = []
+            # Only leader node have remote response mqs
+            if self.parallel_config.node_rank_within_dp == 0:
+                for rank in range(self.world_size):
+                    if rank < self.local_world_size:
+                        local_message_queue = self.workers[
+                            rank].worker_response_mq
+                        assert local_message_queue is not None
+                        self.response_mqs.append(local_message_queue)
+                    else:
+                        remote_message_queue = self.workers[
+                            0].peer_worker_response_mqs[rank]
+                        assert remote_message_queue is not None
+                        self.response_mqs.append(remote_message_queue)

-            # Ensure message queues are ready. Will deadlock if re-ordered
-            # Must be kept consistent with the WorkerProc.
-            self.rpc_broadcast_mq.wait_until_ready()
-            for w in self.workers:
-                w.worker_response_mq.wait_until_ready()
-
-            self.start_worker_monitor()
+            # Wait for all input mqs to be ready.
+            if self.rpc_broadcast_mq is not None:
+                self.rpc_broadcast_mq.wait_until_ready()
+            # Wait for all remote response mqs to be ready.
+            for response_mq in self.response_mqs:
+                response_mq.wait_until_ready()
             success = True
         finally:
             if not success:
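On the leader node (`node_rank_within_dp == 0`) the executor ends up with one response queue per global rank: ranks living on the leader are served by their own worker's `worker_response_mq`, while remote ranks are reached through the peer queues held by worker 0. A runnable miniature of just that selection rule (the worker objects are fake stand-ins, not vLLM types):

```python
from dataclasses import dataclass, field


@dataclass
class FakeWorker:
    """Hypothetical stand-in for a worker handle and its queues."""
    worker_response_mq: str = ""
    peer_worker_response_mqs: dict = field(default_factory=dict)


def collect_response_mqs(workers, world_size, local_world_size):
    # Same rule as the leader branch above: local ranks directly,
    # remote ranks via the peer queues that worker 0 holds.
    mqs = []
    for rank in range(world_size):
        if rank < local_world_size:
            mqs.append(workers[rank].worker_response_mq)
        else:
            mqs.append(workers[0].peer_worker_response_mqs[rank])
    return mqs


workers = [FakeWorker(worker_response_mq=f"local-{r}") for r in range(2)]
workers[0].peer_worker_response_mqs = {2: "remote-2", 3: "remote-3"}
assert collect_response_mqs(workers, world_size=4, local_world_size=2) == [
    "local-0", "local-1", "remote-2", "remote-3"]
```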
@@ -93,17 +129,9 @@ class AscendMultiprocExecutor(MultiprocExecutor):
                     self._ensure_worker_termination(
                         [uw.proc for uw in unready_workers])

-        # For pipeline parallel, we use a thread pool for asynchronous
-        # execute_model.
-        if self.max_concurrent_batches > 1:
-            # Note: must use only 1 IO thread to keep dequeue sequence
-            # from the response queue
-            # _async_aggregate_workers_output also assumes a single IO thread
-            self.io_thread_pool = ThreadPoolExecutor(
-                max_workers=1, thread_name_prefix="mp_exec_io")
+        self.futures_queue = deque[tuple[FutureWrapper, Callable]]()

         self.output_rank = self._get_output_rank()
         self.has_connector = self.vllm_config.kv_transfer_config is not None


 class AscendWorkerProc(WorkerProc):
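This last hunk swaps the single-thread IO pool for a `futures_queue`: pending batches are stored as ordered `(future, drain_fn)` pairs and drained oldest-first when a result is awaited, preserving the response ordering the old one-thread pool guaranteed. A hypothetical miniature of that pattern (vLLM's real `FutureWrapper` differs in detail):

```python
from collections import deque
from concurrent.futures import Future

# Each submission enqueues (future, drain_fn); waiting on any future
# first drains everything queued ahead of it, keeping strict order.
futures_queue: deque = deque()


def submit(drain_fn):
    fut: Future = Future()
    futures_queue.append((fut, drain_fn))
    return fut


def wait_for(target: Future):
    while not target.done():
        fut, drain_fn = futures_queue.popleft()  # oldest first
        fut.set_result(drain_fn())


f1 = submit(lambda: "batch-1")
f2 = submit(lambda: "batch-2")
wait_for(f2)  # draining up to f2 completes f1 first
assert f1.result() == "batch-1" and f2.result() == "batch-2"
```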