support 1 shot allreduce in 1-node and 2-node using mscclpp (#6277)

This commit is contained in:
zyksir
2025-06-05 13:11:24 +08:00
committed by GitHub
parent 4474eaf552
commit 8e3797be1c
20 changed files with 2177 additions and 12 deletions

View File

@@ -26,6 +26,8 @@ def launch_server(args):
cmd += f"--tp-size {args.tp_size} "
# Each optional flag is appended with a trailing space, matching the
# "--tp-size" fragment above, so that consecutive flags stay separate shell
# tokens. Without the separator, enabling both options produced the single
# fused argument "--disable-custom-all-reduce--enable-mscclpp".
if args.disable_custom_all_reduce:
    cmd += "--disable-custom-all-reduce "
if args.enable_mscclpp:
    cmd += "--enable-mscclpp "
# Echo the assembled command line before running it through the shell.
print(cmd)
os.system(cmd)
@@ -63,6 +65,11 @@ if __name__ == "__main__":
action="store_true",
help="Disable custom all reduce when device does not support p2p communication",
)
# Boolean opt-in switch: "store_true" makes args.enable_mscclpp False unless
# --enable-mscclpp is given on the command line.
parser.add_argument(
"--enable-mscclpp",
action="store_true",
help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
)
# Parse the command line and hand the resulting namespace to launch_server.
args = parser.parse_args()
launch_server(args)