Support 1-shot allreduce in 1-node and 2-node setups using mscclpp (#6277)
This commit is contained in:
@@ -26,6 +26,8 @@ def launch_server(args):
|
||||
cmd += f"--tp-size {args.tp_size} "
|
||||
if args.disable_custom_all_reduce:
|
||||
cmd += "--disable-custom-all-reduce"
|
||||
if args.enable_mscclpp:
|
||||
cmd += "--enable-mscclpp"
|
||||
print(cmd)
|
||||
os.system(cmd)
|
||||
|
||||
@@ -63,6 +65,11 @@ if __name__ == "__main__":
|
||||
action="store_true",
|
||||
help="Disable custom all reduce when device does not support p2p communication",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-mscclpp",
|
||||
action="store_true",
|
||||
help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
launch_server(args)
|
||||
|
||||
Reference in New Issue
Block a user