[fix][RL] Remove the incorrect barrier in init_weights_update_group (#5914)
This commit is contained in:
@@ -631,7 +631,6 @@ class ModelRunner:
|
|||||||
rank=rank,
|
rank=rank,
|
||||||
group_name=group_name,
|
group_name=group_name,
|
||||||
)
|
)
|
||||||
dist.barrier(group=self._model_update_group, device_ids=[rank])
|
|
||||||
return True, "Succeeded to initialize custom process group."
|
return True, "Succeeded to initialize custom process group."
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
message = f"Failed to initialize custom process group: {e}."
|
message = f"Failed to initialize custom process group: {e}."
|
||||||
|
|||||||
@@ -162,7 +162,6 @@ def init_process_hf(
|
|||||||
rank=rank,
|
rank=rank,
|
||||||
group_name="test_parameter_update_group",
|
group_name="test_parameter_update_group",
|
||||||
)
|
)
|
||||||
dist.barrier(group=group, device_ids=[rank])
|
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
time_begin_broadcast = time.perf_counter()
|
time_begin_broadcast = time.perf_counter()
|
||||||
|
|
||||||
@@ -223,8 +222,8 @@ def init_process_sgl(
|
|||||||
if rank == 1:
|
if rank == 1:
|
||||||
url = DEFAULT_URL_FOR_TEST
|
url = DEFAULT_URL_FOR_TEST
|
||||||
else:
|
else:
|
||||||
host, port = DEFAULT_URL_FOR_TEST.split(":")
|
host, _, port = DEFAULT_URL_FOR_TEST.rpartition(":")
|
||||||
url = ":".join(host, str(int(port) + 10000))
|
url = ":".join([host, str(int(port) + 10000)])
|
||||||
|
|
||||||
print(f"[sgl] rank {rank} init server on url: {url}")
|
print(f"[sgl] rank {rank} init server on url: {url}")
|
||||||
process = popen_launch_server(
|
process = popen_launch_server(
|
||||||
|
|||||||
Reference in New Issue
Block a user