init

transformers/examples/training/distributed_training.py
@@ -0,0 +1,113 @@
import argparse
import os

import torch
import torch.distributed as dist

# Rank and world-size information comes from environment variables set by the
# launcher. Read the Open MPI variables when the script was started by mpirun,
# otherwise fall back to the ones set by torch.distributed.launch / torchrun.
if "OMPI_COMM_WORLD_RANK" in os.environ:
    # Environment variables set by mpirun
    LOCAL_RANK = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
    WORLD_SIZE = int(os.environ["OMPI_COMM_WORLD_SIZE"])
    WORLD_RANK = int(os.environ["OMPI_COMM_WORLD_RANK"])
else:
    # Environment variables set by torch.distributed.launch
    LOCAL_RANK = int(os.environ["LOCAL_RANK"])
    WORLD_SIZE = int(os.environ["WORLD_SIZE"])
    WORLD_RANK = int(os.environ["RANK"])
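
# A small addition, assuming one GPU per process as in the launch commands
# documented at the bottom of this file: pinning each process to its own GPU
# up front keeps later allocations off cuda:0.
if torch.cuda.is_available():
    torch.cuda.set_device(LOCAL_RANK)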


def run(backend):
    tensor = torch.zeros(1)
    # The tensor must live on a GPU device for the nccl backend; gloo works
    # with CPU tensors.
    if backend == "nccl":
        device = torch.device(f"cuda:{LOCAL_RANK}")
        tensor = tensor.to(device)

    if WORLD_RANK == 0:
        # Rank 0 sends the tensor to every other rank, one blocking send at a time.
        for rank_recv in range(1, WORLD_SIZE):
            dist.send(tensor=tensor, dst=rank_recv)
            print(f"worker_0 sent data to rank {rank_recv}\n")
    else:
        # Every other rank blocks until it receives the tensor from rank 0.
        dist.recv(tensor=tensor, src=0)
        print(f"worker_{WORLD_RANK} has received data from rank 0\n")
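

# A sketch of an alternative, not part of the original example: the same
# "rank 0 fans a value out to every other rank" pattern can be written as a
# single collective instead of a loop of point-to-point sends.
def run_broadcast(backend):
    tensor = torch.zeros(1)
    if backend == "nccl":
        tensor = tensor.to(torch.device(f"cuda:{LOCAL_RANK}"))
    # Every rank calls broadcast; ranks other than src receive rank 0's value.
    dist.broadcast(tensor, src=0)
    print(f"worker_{WORLD_RANK} holds {tensor.item()} after the broadcast\n")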


def init_processes(backend):
    # The default "env://" init method reads MASTER_ADDR and MASTER_PORT,
    # which is why the launch commands below export them on every node.
    dist.init_process_group(backend, rank=WORLD_RANK, world_size=WORLD_SIZE)
    run(backend)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--local_rank",
        type=int,
        help="Local rank. Necessary for using the torch.distributed.launch utility.",
    )
    parser.add_argument("--backend", type=str, default="nccl", choices=["nccl", "gloo"])
    args = parser.parse_args()

    init_processes(backend=args.backend)
""""
|
||||
python-m torch.distributed.launch \
|
||||
--nproc_per_node=2 --nnodes=2 --node_rank=0 \
|
||||
test_compile.py
|
||||
|
||||
python3 -m torch.distributed.launch \
|
||||
--nproc_per_node=2 --nnodes=2 --node_rank=1 \
|
||||
--master_addr=104.171.200.62 --master_port=1234 \
|
||||
main.py \
|
||||
--backend=nccl --use_syn --batch_size=8192 --arch=resnet152
|
||||
|
||||
|
||||
|
||||
mpirun -np 4 \
|
||||
-H 104.171.200.62:2,104.171.200.182:2 \
|
||||
-x MASTER_ADDR=104.171.200.62 \
|
||||
-x MASTER_PORT=1234 \
|
||||
-x PATH \
|
||||
-bind-to none -map-by slot \
|
||||
-mca pml ob1 -mca btl ^openib \
|
||||
python3 main.py
|
||||
"""
|
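
# Note: in recent PyTorch releases torch.distributed.launch is deprecated in
# favor of torchrun, which accepts the same rendezvous flags, e.g. for node 0
# above:
#
#   torchrun --nproc_per_node=2 --nnodes=2 --node_rank=0 \
#       --master_addr=104.171.200.62 --master_port=1234 \
#       main.py --backend=nccl --use_syn --batch_size=8192 --arch=resnet152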

"""
Running via mpirun across machines needs a hostfile listing the hosts.
For example, with the machines arthur@ip-26-0-162-46 and arthur@ip-26-0-162-239:

________
hostfile
ip-26-0-162-46 slots=8
ip-26-0-162-239 slots=8
________

With 8 slots on each of the two hosts, -np 16 starts one process per GPU:

mpirun --hostfile hostfile -np 16 \
    --bind-to none --map-by slot \
    -x MASTER_ADDR=<master-node-ip> \
    -x MASTER_PORT=29500 \
    -x NCCL_DEBUG=INFO \
    -x NCCL_SOCKET_IFNAME=^lo,docker0 \
    -x CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
    python your_script.py --backend nccl

To find the master node's IP, run on that node:

hostname -I | awk '{print $1}'

Use `ping ip-26-0-162-46` to check that the hosts can reach each other.

With the master IP resolved to 26.0.162.46, the launch becomes:

mpirun --hostfile hostfile -np 16 \
    --bind-to none --map-by slot \
    -x MASTER_ADDR=26.0.162.46 \
    -x MASTER_PORT=29500 \
    -x NCCL_DEBUG=INFO \
    -x NCCL_SOCKET_IFNAME=^lo,docker0 \
    -x CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
    python your_script.py --backend nccl

To test the setup, print each process's local rank:

mpirun --hostfile hostfile -np 2 -x NCCL_DEBUG=INFO \
    python -c "import os; print(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])"
"""