From 5ccf8fe1a07e9ccfbec3bc572da4a129246ce3d3 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Mon, 26 May 2025 00:13:17 +0800 Subject: [PATCH] Hint users when weight update times out (#6570) --- .../srt/model_executor/expert_location_updater.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/model_executor/expert_location_updater.py b/python/sglang/srt/model_executor/expert_location_updater.py index 266b79283..ca31ef740 100644 --- a/python/sglang/srt/model_executor/expert_location_updater.py +++ b/python/sglang/srt/model_executor/expert_location_updater.py @@ -338,8 +338,14 @@ def update_expert_weights_single_layer( return reqs = torch.distributed.batch_isend_irecv(p2p_ops) - for req in reqs: - req.wait() + try: + for req in reqs: + req.wait(timeout=30) + except RuntimeError: + logger.error( + f"Context: {rank=} {old_physical_to_logical_map=} {new_physical_to_logical_map=} {num_local_physical_experts=} {num_gpu_per_node=}" + ) + raise def _execute_buffer2weight_copies(buffer2weight_copy_infos): for (