Add back data parallelism (#1635)

This commit is contained in:
Lianmin Zheng
2024-10-11 07:22:48 -07:00
committed by GitHub
parent 5d09ca5735
commit 23cc66f7b6
7 changed files with 228 additions and 39 deletions

View File

@@ -142,7 +142,7 @@ class Scheduler:
gpu_id=gpu_id,
tp_rank=tp_rank,
server_args=server_args,
-nccl_port=port_args.nccl_ports[0],
+nccl_port=port_args.nccl_port,
)
self.tp_cpu_group = self.tp_worker.model_runner.tp_group.cpu_group
@@ -1042,9 +1042,14 @@ def run_scheduler_process(
port_args: PortArgs,
gpu_id: int,
tp_rank: int,
+dp_rank: Optional[int],
pipe_writer,
):
-configure_logger(server_args, prefix=f" TP{tp_rank}")
+if dp_rank is None:
+configure_logger(server_args, prefix=f" TP{tp_rank}")
+else:
+configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
suppress_other_loggers()
try: