fix io group (#9154)

Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
This commit is contained in:
pansicheng
2025-08-14 12:46:42 +08:00
committed by GitHub
parent 4c22897a66
commit 733446dd36

View File

@@ -296,6 +296,9 @@ class HiCacheController:
self.prefetch_tp_group = torch.distributed.new_group(
group_ranks, backend="gloo"
)
+ self.prefetch_io_tp_group = torch.distributed.new_group(
+ group_ranks, backend="gloo"
+ )
self.backup_tp_group = torch.distributed.new_group(
group_ranks, backend="gloo"
)
@@ -602,7 +605,7 @@ class HiCacheController:
if self.tp_world_size > 1:
# to ensure all TP workers release the host memory at the same time
- torch.distributed.barrier(group=self.prefetch_tp_group)
+ torch.distributed.barrier(group=self.prefetch_io_tp_group)
# operation terminated by controller, release pre-allocated memory
self.mem_pool_host.free(
operation.host_indices[operation.completed_tokens :]