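Fix IO group: add a dedicated `prefetch_io_tp_group` and use it for the host-memory release barrier instead of `prefetch_tp_group` (#9154)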
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
This commit is contained in:
@@ -296,6 +296,9 @@ class HiCacheController:
|
|||||||
self.prefetch_tp_group = torch.distributed.new_group(
|
self.prefetch_tp_group = torch.distributed.new_group(
|
||||||
group_ranks, backend="gloo"
|
group_ranks, backend="gloo"
|
||||||
)
|
)
|
||||||
|
self.prefetch_io_tp_group = torch.distributed.new_group(
|
||||||
|
group_ranks, backend="gloo"
|
||||||
|
)
|
||||||
self.backup_tp_group = torch.distributed.new_group(
|
self.backup_tp_group = torch.distributed.new_group(
|
||||||
group_ranks, backend="gloo"
|
group_ranks, backend="gloo"
|
||||||
)
|
)
|
||||||
@@ -602,7 +605,7 @@ class HiCacheController:
|
|||||||
|
|
||||||
if self.tp_world_size > 1:
|
if self.tp_world_size > 1:
|
||||||
# to ensure all TP workers release the host memory at the same time
|
# to ensure all TP workers release the host memory at the same time
|
||||||
torch.distributed.barrier(group=self.prefetch_tp_group)
|
torch.distributed.barrier(group=self.prefetch_io_tp_group)
|
||||||
# operation terminated by controller, release pre-allocated memory
|
# operation terminated by controller, release pre-allocated memory
|
||||||
self.mem_pool_host.free(
|
self.mem_pool_host.free(
|
||||||
operation.host_indices[operation.completed_tokens :]
|
operation.host_indices[operation.completed_tokens :]
|
||||||
|
|||||||
Reference in New Issue
Block a user