Hint users when weight update times out (#6570)

This commit is contained in:
fzyzcjy
2025-05-26 00:13:17 +08:00
committed by GitHub
parent 3f23d8cdf1
commit 5ccf8fe1a0

View File

@@ -338,8 +338,14 @@ def update_expert_weights_single_layer(
return
reqs = torch.distributed.batch_isend_irecv(p2p_ops)
for req in reqs:
req.wait()
try:
for req in reqs:
req.wait(timeout=30)
except RuntimeError:
logger.error(
f"Context: {rank=} {old_physical_to_logical_map=} {new_physical_to_logical_map=} {num_local_physical_experts=} {num_gpu_per_node=}"
)
raise
def _execute_buffer2weight_copies(buffer2weight_copy_infos):
for (