diff --git a/python/sglang/srt/model_executor/expert_location_updater.py b/python/sglang/srt/model_executor/expert_location_updater.py index 266b79283..ca31ef740 100644 --- a/python/sglang/srt/model_executor/expert_location_updater.py +++ b/python/sglang/srt/model_executor/expert_location_updater.py @@ -338,8 +338,14 @@ def update_expert_weights_single_layer( return reqs = torch.distributed.batch_isend_irecv(p2p_ops) - for req in reqs: - req.wait() + try: + for req in reqs: + req.wait(timeout=30) + except RuntimeError: + logger.error( + f"Context: {rank=} {old_physical_to_logical_map=} {new_physical_to_logical_map=} {num_local_physical_experts=} {num_gpu_per_node=}" + ) + raise def _execute_buffer2weight_copies(buffer2weight_copy_infos): for (