From 5305a2ccf943304435a7716120ee6bb5c130a6b8 Mon Sep 17 00:00:00 2001 From: yiz-liu <136800916+yiz-liu@users.noreply.github.com> Date: Mon, 12 May 2025 17:31:29 +0800 Subject: [PATCH] =?UTF-8?q?[Bugfix]=20Tweak=20distributed=20process=20grou?= =?UTF-8?q?p=20initialization=20and=20add=20dummy=E2=80=A6=20(#816)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix batch execution method to enable DP in V1 Signed-off-by: Yizhou Liu --- .../patch_common/patch_distributed.py | 24 +++++++++++++------ vllm_ascend/worker/worker_v1.py | 3 +++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_common/patch_distributed.py index 5dd5c66..ac46ab0 100644 --- a/vllm_ascend/patch/platform/patch_common/patch_distributed.py +++ b/vllm_ascend/patch/platform/patch_common/patch_distributed.py @@ -47,7 +47,7 @@ def ascend_destroy_model_parallel(): destory_ascend_model_parallel() -def ascend_stateless_init_torch_distributed_process_group( +def stateless_init_torch_distributed_process_group( host: str, port: int, rank: int, world_size: int, backend: str) -> ProcessGroup: """ @@ -96,10 +96,16 @@ def ascend_stateless_init_torch_distributed_process_group( # different systems (e.g. RPC) in case the store is multi-tenant. prefix_store = PrefixStore(init_method, store) + # TODO(Yizhou): The reason we need to set options while vllm does not + # seems to be related to the version of PyTorch. In the latest version, + # there is no need to set options. While in the older version, 2.5.1 + # specifically, we need to set options. 
+ options = ProcessGroup.Options(backend=backend) pg: ProcessGroup = ProcessGroup( prefix_store, group_rank, group_size, + options, ) if backend == "gloo": from torch.distributed.distributed_c10d import ProcessGroupGloo @@ -136,7 +142,10 @@ def ascend_stateless_init_torch_distributed_process_group( else: raise RuntimeError(f"Unsupported torch distributed backend: {backend}") - pg._set_default_backend(backend_type) + # TODO(Yizhou): As we mentioned above, _set_default_backend is not + # implemented in the 2.5.1 version of PyTorch. But we need to set it + # after the latest version is released. + # pg._set_default_backend(backend_type) backend_class._set_sequence_number_for_group() pg._register_backend(device, backend_type, backend_class) @@ -163,20 +172,21 @@ def parallel_config_get_dp_port(self) -> int: def ascend_stateless_init_dp_group(self) -> "ProcessGroup": - from vllm.distributed.utils import \ - stateless_init_torch_distributed_process_group - + # TODO(Yizhou): Currently we have to set the backend to gloo + # because in vllm.config.ParallelConfig.has_unfinished_dp the + # device is set to cpu. We need to fix this in the future. + # We need to compare the performance of gloo and hccl and then + # decide which one to use. 
dp_group = stateless_init_torch_distributed_process_group( self.data_parallel_master_ip, self.get_next_dp_init_port(), self.data_parallel_rank, self.data_parallel_size, - backend="hccl") + backend="gloo") return dp_group vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel -vllm.distributed.stateless_init_torch_distributed_process_group = ascend_stateless_init_torch_distributed_process_group ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port ParallelConfig.stateless_init_dp_group = ascend_stateless_init_dp_group diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index b31c8f1..2ba1973 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -216,6 +216,9 @@ class NPUWorker(WorkerBase): else: self.profiler.stop() + def execute_dummy_batch(self) -> None: + self.model_runner._dummy_run(1) + def _init_worker_distributed_environment(self) -> None: """Initialize the distributed environment.""" additional_config = self.vllm_config.additional_config