Hint users when weight update times out (#6570)
This commit is contained in:
@@ -338,8 +338,14 @@ def update_expert_weights_single_layer(
|
|||||||
return
|
return
|
||||||
|
|
||||||
reqs = torch.distributed.batch_isend_irecv(p2p_ops)
|
reqs = torch.distributed.batch_isend_irecv(p2p_ops)
|
||||||
for req in reqs:
|
try:
|
||||||
req.wait()
|
for req in reqs:
|
||||||
|
req.wait(timeout=30)
|
||||||
|
except RuntimeError:
|
||||||
|
logger.error(
|
||||||
|
f"Context: {rank=} {old_physical_to_logical_map=} {new_physical_to_logical_map=} {num_local_physical_experts=} {num_gpu_per_node=}"
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
def _execute_buffer2weight_copies(buffer2weight_copy_infos):
|
def _execute_buffer2weight_copies(buffer2weight_copy_infos):
|
||||||
for (
|
for (
|
||||||
|
|||||||
Reference in New Issue
Block a user