[fix][RL] Remove the incorrect barrier in init_weights_update_group (#5914)

This commit is contained in:
Zilin Zhu
2025-05-15 10:15:21 +08:00
committed by GitHub
parent f3bf611054
commit 44a3783d13
2 changed files with 2 additions and 4 deletions

View File

@@ -631,7 +631,6 @@ class ModelRunner:
rank=rank,
group_name=group_name,
)
- dist.barrier(group=self._model_update_group, device_ids=[rank])
return True, "Succeeded to initialize custom process group."
except Exception as e:
message = f"Failed to initialize custom process group: {e}."

View File

@@ -162,7 +162,6 @@ def init_process_hf(
rank=rank,
group_name="test_parameter_update_group",
)
- dist.barrier(group=group, device_ids=[rank])
torch.cuda.synchronize()
time_begin_broadcast = time.perf_counter()
@@ -223,8 +222,8 @@ def init_process_sgl(
if rank == 1:
url = DEFAULT_URL_FOR_TEST
else:
- host, port = DEFAULT_URL_FOR_TEST.split(":")
- url = ":".join(host, str(int(port) + 10000))
+ host, _, port = DEFAULT_URL_FOR_TEST.rpartition(":")
+ url = ":".join([host, str(int(port) + 10000)])
print(f"[sgl] rank {rank} init server on url: {url}")
process = popen_launch_server(