[Feature] add disable-custom-all-reduce (#1148)

Co-authored-by: chenxu02 <chenxu02@zhihu.com>
Co-authored-by: Yineng Zhang <me@zhyncs.com>
2024-08-20 23:44:12 +08:00
parent a8ae640328
commit ff2cfdb1a2
2 changed files with 9 additions and 0 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -37,6 +37,7 @@ from vllm.distributed import (
    get_tp_group,
    init_distributed_environment,
    initialize_model_parallel,
    set_custom_all_reduce,
 )
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.model_executor.model_loader import get_model
@@ -105,6 +106,7 @@ class ModelRunner:
            nccl_init_method = f"tcp://{server_args.nccl_init_addr}"
        else:
            nccl_init_method = f"tcp://127.0.0.1:{self.nccl_port}"
        set_custom_all_reduce(not server_args.disable_custom_all_reduce)
        init_distributed_environment(
            backend="nccl",
            world_size=self.tp_size,
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -86,6 +86,7 @@ class ServerArgs:
    enable_mla: bool = False
    attention_reduce_in_fp32: bool = False
    efficient_weight_load: bool = False
    disable_custom_all_reduce: bool = False
    # Distributed args
    nccl_init_addr: Optional[str] = None
@@ -428,6 +429,12 @@ class ServerArgs:
            action="store_true",
            help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
        )
        parser.add_argument(
            "--disable-custom-all-reduce",
            action="store_true",
            default=False,
            help="Disable the custom all-reduce kernel and fall back to NCCL.",
        )
    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):