init

transformers/examples/training/distributed_training.py
@@ -0,0 +1,113 @@
import argparse
import os

import torch
import torch.distributed as dist

# Rank and world-size information comes from environment variables set by the
# launcher. Read the Open MPI variables when the script was started by mpirun,
# otherwise fall back to the ones set by torch.distributed.launch / torchrun.
if "OMPI_COMM_WORLD_RANK" in os.environ:
    # Environment variables set by mpirun
    LOCAL_RANK = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
    WORLD_SIZE = int(os.environ["OMPI_COMM_WORLD_SIZE"])
    WORLD_RANK = int(os.environ["OMPI_COMM_WORLD_RANK"])
else:
    # Environment variables set by torch.distributed.launch
    LOCAL_RANK = int(os.environ["LOCAL_RANK"])
    WORLD_SIZE = int(os.environ["WORLD_SIZE"])
    WORLD_RANK = int(os.environ["RANK"])
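
# A small addition, assuming one GPU per process as in the launch commands
# documented at the bottom of this file: pinning each process to its own GPU
# up front keeps later allocations off cuda:0.
if torch.cuda.is_available():
    torch.cuda.set_device(LOCAL_RANK)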


def run(backend):
    tensor = torch.zeros(1)
    # The tensor must live on a GPU device for the nccl backend; gloo works
    # with CPU tensors.
    if backend == "nccl":
        device = torch.device(f"cuda:{LOCAL_RANK}")
        tensor = tensor.to(device)

    if WORLD_RANK == 0:
        # Rank 0 sends the tensor to every other rank, one blocking send at a time.
        for rank_recv in range(1, WORLD_SIZE):
            dist.send(tensor=tensor, dst=rank_recv)
            print(f"worker_0 sent data to rank {rank_recv}\n")
    else:
        # Every other rank blocks until it receives the tensor from rank 0.
        dist.recv(tensor=tensor, src=0)
        print(f"worker_{WORLD_RANK} has received data from rank 0\n")
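

# A sketch of an alternative, not part of the original example: the same
# "rank 0 fans a value out to every other rank" pattern can be written as a
# single collective instead of a loop of point-to-point sends.
def run_broadcast(backend):
    tensor = torch.zeros(1)
    if backend == "nccl":
        tensor = tensor.to(torch.device(f"cuda:{LOCAL_RANK}"))
    # Every rank calls broadcast; ranks other than src receive rank 0's value.
    dist.broadcast(tensor, src=0)
    print(f"worker_{WORLD_RANK} holds {tensor.item()} after the broadcast\n")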


def init_processes(backend):
    # The default "env://" init method reads MASTER_ADDR and MASTER_PORT,
    # which is why the launch commands below export them on every node.
    dist.init_process_group(backend, rank=WORLD_RANK, world_size=WORLD_SIZE)
    run(backend)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--local_rank",
        type=int,
        help="Local rank. Necessary for using the torch.distributed.launch utility.",
    )
    parser.add_argument("--backend", type=str, default="nccl", choices=["nccl", "gloo"])
    args = parser.parse_args()

    init_processes(backend=args.backend)
""""
|
||||
python-m torch.distributed.launch \
|
||||
--nproc_per_node=2 --nnodes=2 --node_rank=0 \
|
||||
test_compile.py
|
||||
|
||||
python3 -m torch.distributed.launch \
|
||||
--nproc_per_node=2 --nnodes=2 --node_rank=1 \
|
||||
--master_addr=104.171.200.62 --master_port=1234 \
|
||||
main.py \
|
||||
--backend=nccl --use_syn --batch_size=8192 --arch=resnet152
|
||||
|
||||
|
||||
|
||||
mpirun -np 4 \
|
||||
-H 104.171.200.62:2,104.171.200.182:2 \
|
||||
-x MASTER_ADDR=104.171.200.62 \
|
||||
-x MASTER_PORT=1234 \
|
||||
-x PATH \
|
||||
-bind-to none -map-by slot \
|
||||
-mca pml ob1 -mca btl ^openib \
|
||||
python3 main.py
|
||||
"""
|
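
# Note: in recent PyTorch releases torch.distributed.launch is deprecated in
# favor of torchrun, which accepts the same rendezvous flags, e.g. for node 0
# above:
#
#   torchrun --nproc_per_node=2 --nnodes=2 --node_rank=0 \
#       --master_addr=104.171.200.62 --master_port=1234 \
#       main.py --backend=nccl --use_syn --batch_size=8192 --arch=resnet152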

"""
Running via mpirun across machines needs a hostfile listing the hosts.
For example, with the machines arthur@ip-26-0-162-46 and arthur@ip-26-0-162-239:

________
hostfile
ip-26-0-162-46 slots=8
ip-26-0-162-239 slots=8
________

With 8 slots on each of the two hosts, -np 16 starts one process per GPU:

mpirun --hostfile hostfile -np 16 \
    --bind-to none --map-by slot \
    -x MASTER_ADDR=<master-node-ip> \
    -x MASTER_PORT=29500 \
    -x NCCL_DEBUG=INFO \
    -x NCCL_SOCKET_IFNAME=^lo,docker0 \
    -x CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
    python your_script.py --backend nccl

To find the master node's IP, run on that node:

hostname -I | awk '{print $1}'

Use `ping ip-26-0-162-46` to check that the hosts can reach each other.

With the master IP resolved to 26.0.162.46, the launch becomes:

mpirun --hostfile hostfile -np 16 \
    --bind-to none --map-by slot \
    -x MASTER_ADDR=26.0.162.46 \
    -x MASTER_PORT=29500 \
    -x NCCL_DEBUG=INFO \
    -x NCCL_SOCKET_IFNAME=^lo,docker0 \
    -x CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
    python your_script.py --backend nccl

To test the setup, print each process's local rank:

mpirun --hostfile hostfile -np 2 -x NCCL_DEBUG=INFO \
    python -c "import os; print(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])"
"""