From 733446dd36033831496a0fe0905f743b5fc9db7a Mon Sep 17 00:00:00 2001 From: pansicheng Date: Thu, 14 Aug 2025 12:46:42 +0800 Subject: [PATCH] fix io group (#9154) Co-authored-by: Zhiqiang Xie --- python/sglang/srt/managers/cache_controller.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/cache_controller.py b/python/sglang/srt/managers/cache_controller.py index 08e2146af..b25bf4032 100644 --- a/python/sglang/srt/managers/cache_controller.py +++ b/python/sglang/srt/managers/cache_controller.py @@ -296,6 +296,9 @@ class HiCacheController: self.prefetch_tp_group = torch.distributed.new_group( group_ranks, backend="gloo" ) + self.prefetch_io_tp_group = torch.distributed.new_group( + group_ranks, backend="gloo" + ) self.backup_tp_group = torch.distributed.new_group( group_ranks, backend="gloo" ) @@ -602,7 +605,7 @@ class HiCacheController: if self.tp_world_size > 1: # to ensure all TP workers release the host memory at the same time - torch.distributed.barrier(group=self.prefetch_tp_group) + torch.distributed.barrier(group=self.prefetch_io_tp_group) # operation terminated by controller, release pre-allocated memory self.mem_pool_host.free( operation.host_indices[operation.completed_tokens :]