From 44a3783d13146ba106a88c44be89518698ef6065 Mon Sep 17 00:00:00 2001 From: Zilin Zhu Date: Thu, 15 May 2025 10:15:21 +0800 Subject: [PATCH] [fix][RL] Remove the incorrect barrier in init_weights_update_group (#5914) --- python/sglang/srt/model_executor/model_runner.py | 1 - test/srt/test_update_weights_from_distributed.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 4117e5f62..3846a283d 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -631,7 +631,6 @@ class ModelRunner: rank=rank, group_name=group_name, ) - dist.barrier(group=self._model_update_group, device_ids=[rank]) return True, "Succeeded to initialize custom process group." except Exception as e: message = f"Failed to initialize custom process group: {e}." diff --git a/test/srt/test_update_weights_from_distributed.py b/test/srt/test_update_weights_from_distributed.py index 064406703..cfcc5d951 100644 --- a/test/srt/test_update_weights_from_distributed.py +++ b/test/srt/test_update_weights_from_distributed.py @@ -162,7 +162,6 @@ def init_process_hf( rank=rank, group_name="test_parameter_update_group", ) - dist.barrier(group=group, device_ids=[rank]) torch.cuda.synchronize() time_begin_broadcast = time.perf_counter() @@ -223,8 +222,8 @@ def init_process_sgl( if rank == 1: url = DEFAULT_URL_FOR_TEST else: - host, port = DEFAULT_URL_FOR_TEST.split(":") - url = ":".join(host, str(int(port) + 10000)) + host, _, port = DEFAULT_URL_FOR_TEST.rpartition(":") + url = ":".join([host, str(int(port) + 10000)]) print(f"[sgl] rank {rank} init server on url: {url}") process = popen_launch_server(