From 5305a2ccf943304435a7716120ee6bb5c130a6b8 Mon Sep 17 00:00:00 2001 From: yiz-liu <136800916+yiz-liu@users.noreply.github.com> Date: Mon, 12 May 2025 17:31:29 +0800 Subject: [PATCH] =?UTF-8?q?[Bugfix]=20Tweak=20distributed=20process=20grou?= =?UTF-8?q?p=20initialization=20and=20add=20dummy=E2=80=A6=20(#816)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix batch execution method to enable DP in V1 Signed-off-by: Yizhou Liu --- .../patch_common/patch_distributed.py | 24 +++++++++++++------ vllm_ascend/worker/worker_v1.py | 3 +++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_common/patch_distributed.py index 5dd5c66..ac46ab0 100644 --- a/vllm_ascend/patch/platform/patch_common/patch_distributed.py +++ b/vllm_ascend/patch/platform/patch_common/patch_distributed.py @@ -47,7 +47,7 @@ def ascend_destroy_model_parallel(): destory_ascend_model_parallel() -def ascend_stateless_init_torch_distributed_process_group( +def stateless_init_torch_distributed_process_group( host: str, port: int, rank: int, world_size: int, backend: str) -> ProcessGroup: """ @@ -96,10 +96,16 @@ def ascend_stateless_init_torch_distributed_process_group( # different systems (e.g. RPC) in case the store is multi-tenant. prefix_store = PrefixStore(init_method, store) + # TODO(Yizhou): The reason we need to set options while vllm does not + # seems to be related to the version of PyTorch. In the latest version, + # there is no need to set options. While in the older version, 2.5.1 + # specifically, we need to set options. 
+ options = ProcessGroup.Options(backend=backend) pg: ProcessGroup = ProcessGroup( prefix_store, group_rank, group_size, + options, ) if backend == "gloo": from torch.distributed.distributed_c10d import ProcessGroupGloo @@ -136,7 +142,10 @@ def ascend_stateless_init_torch_distributed_process_group( else: raise RuntimeError(f"Unsupported torch distributed backend: {backend}") - pg._set_default_backend(backend_type) + # TODO(Yizhou): As we mentioned above, _set_default_backend is not + # implemented in the 2.5.1 version of PyTorch. But we need to set it + # after the latest version is released. + # pg._set_default_backend(backend_type) backend_class._set_sequence_number_for_group() pg._register_backend(device, backend_type, backend_class) @@ -163,20 +172,21 @@ def parallel_config_get_dp_port(self) -> int: def ascend_stateless_init_dp_group(self) -> "ProcessGroup": - from vllm.distributed.utils import \ - stateless_init_torch_distributed_process_group - + # TODO(Yizhou): Currently we have to set the backend to gloo + # because in vllm.config.ParallelConfig.has_unfinished_dp the + # device is set to cpu. We need to fix this in the future. + # We need to compare the performance of gloo and hccl and then + # decide which one to use. 
dp_group = stateless_init_torch_distributed_process_group( self.data_parallel_master_ip, self.get_next_dp_init_port(), self.data_parallel_rank, self.data_parallel_size, - backend="hccl") + backend="gloo") return dp_group vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel -vllm.distributed.stateless_init_torch_distributed_process_group = ascend_stateless_init_torch_distributed_process_group ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port ParallelConfig.stateless_init_dp_group = ascend_stateless_init_dp_group diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index b31c8f1..2ba1973 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -216,6 +216,9 @@ class NPUWorker(WorkerBase): else: self.profiler.stop() + def execute_dummy_batch(self) -> None: + self.model_runner._dummy_run(1) + def _init_worker_distributed_environment(self) -> None: """Initialize the distributed environment.""" additional_config = self.vllm_config.additional_config