Fix IO group: create a dedicated prefetch_io_tp_group and use it for the host-memory release barrier (#9154)
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
This commit is contained in:
@@ -296,6 +296,9 @@ class HiCacheController:
|
||||
self.prefetch_tp_group = torch.distributed.new_group(
|
||||
group_ranks, backend="gloo"
|
||||
)
|
||||
self.prefetch_io_tp_group = torch.distributed.new_group(
|
||||
group_ranks, backend="gloo"
|
||||
)
|
||||
self.backup_tp_group = torch.distributed.new_group(
|
||||
group_ranks, backend="gloo"
|
||||
)
|
||||
@@ -602,7 +605,7 @@ class HiCacheController:
|
||||
|
||||
if self.tp_world_size > 1:
|
||||
# to ensure all TP workers release the host memory at the same time
|
||||
torch.distributed.barrier(group=self.prefetch_tp_group)
|
||||
torch.distributed.barrier(group=self.prefetch_io_tp_group)
|
||||
# operation terminated by controller, release pre-allocated memory
|
||||
self.mem_pool_host.free(
|
||||
operation.host_indices[operation.completed_tokens :]
|
||||
|
||||
Reference in New Issue
Block a user