From 10a9ab7b07e79247133669f74446fd5c4a39fc28 Mon Sep 17 00:00:00 2001
From: Kebe
Date: Fri, 28 Mar 2025 09:52:10 +0800
Subject: [PATCH] Fix error due to CustomAllreduce setup failure (#4815)

Signed-off-by: Kebe
---
 python/sglang/srt/distributed/parallel_state.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/distributed/parallel_state.py b/python/sglang/srt/distributed/parallel_state.py
index d4000b866..e43bc0000 100644
--- a/python/sglang/srt/distributed/parallel_state.py
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -264,10 +264,16 @@ class GroupCoordinator:
         self.ca_comm: Optional[CustomAllreduce] = None
         if use_custom_allreduce and self.world_size > 1:
             # Initialize a custom fast all-reduce implementation.
-            self.ca_comm = CustomAllreduce(
-                group=self.cpu_group,
-                device=self.device,
-            )
+            try:
+                self.ca_comm = CustomAllreduce(
+                    group=self.cpu_group,
+                    device=self.device,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Setup Custom allreduce failed with {e}. To silence this "
+                    "warning, specify --disable-custom-all-reduce explicitly."
+                )
 
         from sglang.srt.distributed.device_communicators.hpu_communicator import (
             HpuCommunicator,