[Feat] Support Torch Symm Mem AllReduce (#10571)

Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
2025-10-06 04:55:19 +08:00
parent 148d8d485d
commit 590f2da052
8 changed files with 466 additions and 1 deletions
--- a/benchmark/kernels/all_reduce/benchmark_symm_mem.py
+++ b/benchmark/kernels/all_reduce/benchmark_symm_mem.py
@@ -0,0 +1,234 @@
 """For Now, SYMM_MEM is only supported on TP8 case
 export WORLD_SIZE=1
 export RANK=0
 export MASTER_ADDR=127.0.0.1
 export MASTER_PORT=12345
 torchrun --nproc_per_node gpu \
 --nnodes $WORLD_SIZE \
 --node_rank $RANK \
 --master_addr $MASTER_ADDR \
 --master_port $MASTER_PORT ./benchmark/kernels/all_reduce/benchmark_symm_mem.py
 """
 import os
 from contextlib import nullcontext
 from typing import List
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from sglang.srt.distributed import init_distributed_environment
 from sglang.srt.distributed.device_communicators.pynccl import PyNcclCommunicator
 from sglang.srt.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from sglang.srt.distributed.parallel_state import (
    get_tensor_model_parallel_group,
    graph_capture,
    initialize_model_parallel,
    set_symm_mem_all_reduce,
 )
 # CI environment detection
 IS_CI = (
    os.getenv("CI", "false").lower() == "true"
    or os.getenv("GITHUB_ACTIONS", "false").lower() == "true"
 )
 def torch_allreduce(torch_input: torch.Tensor, group: ProcessGroup) -> torch.Tensor:
    dist.all_reduce(torch_input, group=group)
    return torch_input
 def symm_mem_allreduce(
    symm_mem_input: torch.Tensor, symm_mem_comm: SymmMemCommunicator
 ) -> torch.Tensor:
    return symm_mem_comm.all_reduce(symm_mem_input)
 def pynccl_allreduce(
    pynccl_input: torch.Tensor, pynccl_comm: PyNcclCommunicator
 ) -> torch.Tensor:
    pynccl_comm.all_reduce(pynccl_input)
    return pynccl_input
 def _bench_graph_time(func, inp_randn, warmup_loop=2, graph_loop=10, test_loop=10):
    graph_input = inp_randn.clone()
    with graph_capture() as graph_capture_context:
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph, stream=graph_capture_context.stream):
            for _ in range(graph_loop):
                graph_out = func(graph_input)
    graph.replay()
    func_output = graph_out.clone()
    for _ in range(warmup_loop):
        graph.replay()
    torch.cuda.synchronize()
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    latencies: List[float] = []
    for _ in range(test_loop):
        torch.cuda.synchronize()
        dist.barrier()
        start_event.record()
        graph.replay()
        end_event.record()
        end_event.synchronize()
        latencies.append(start_event.elapsed_time(end_event))
    func_cost_us = sum(latencies) / len(latencies) / graph_loop * 1000
    graph.reset()
    return func_output, func_cost_us
 def _bench_eager_time(func, inp_randn, warmup_loop=2, test_loop=10):
    eager_input = inp_randn.clone()
    eager_output = func(eager_input)
    func_output = eager_output.clone()
    for _ in range(warmup_loop):
        func(eager_input)
    torch.cuda.synchronize()
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start_event.record()
    for _ in range(test_loop):
        func(eager_input)
    end_event.record()
    torch.cuda.synchronize()
    func_cost_us = start_event.elapsed_time(end_event) / test_loop * 1000
    return func_output, func_cost_us
 def get_torch_prof_ctx(do_prof: bool):
    ctx = (
        torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            record_shapes=True,
            with_stack=True,
        )
        if do_prof
        else nullcontext()
    )
    return ctx
 def human_readable_size(size, decimal_places=1):
    for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
        if size < 1024.0 or unit == "PiB":
            break
        size /= 1024.0
    return f"{size:.{decimal_places}f} {unit}"
 try:
    from tabulate import tabulate
 except ImportError:
    print("tabulate not installed, skipping table printing")
    tabulate = None
 def print_markdown_table(data):
    if tabulate is not None:
        print(tabulate(data, headers="keys", tablefmt="github"))
        return
    headers = data[0].keys()
    header_row = "| " + " | ".join(headers) + " |"
    separator = "| " + " | ".join(["---"] * len(headers)) + " |"
    rows = []
    for item in data:
        row = "| " + " | ".join(str(item[key]) for key in headers) + " |"
        rows.append(row)
    markdown_table = "\n".join([header_row, separator] + rows)
    print(markdown_table)
 if __name__ == "__main__":
    import logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        force=True,
    )
    if not dist.is_initialized():
        dist.init_process_group(backend="nccl")
    world, world_size = dist.group.WORLD, dist.get_world_size()
    rank = dist.get_rank()
    torch.cuda.set_device(rank % 8)
    device = torch.cuda.current_device()
    set_symm_mem_all_reduce(True)
    init_distributed_environment(
        world_size=world_size,
        rank=rank,
        local_rank=rank % 8,
    )
    initialize_model_parallel(tensor_model_parallel_size=world_size)
    group = get_tensor_model_parallel_group().device_group
    cpu_group = get_tensor_model_parallel_group().cpu_group
    pynccl_comm = get_tensor_model_parallel_group().pynccl_comm
    symm_mem_comm = get_tensor_model_parallel_group().symm_mem_comm
    dist.barrier()
    profile = False
    dtype = torch.bfloat16
    ctx = get_torch_prof_ctx(profile)
    result = []
    with ctx:
        if IS_CI:
            i_range = range(10, 11)
        else:
            i_range = range(10, 20)
        for i in i_range:
            sz = 2**i
            if sz * dtype.itemsize > 2**24:
                break
            inp_randn = torch.randint(1, 16, (sz,), dtype=dtype, device=device)
            memory = torch.empty_like(inp_randn)
            memory_out = torch.empty_like(memory)
            torch_eager_output, torch_eager_time = _bench_eager_time(
                lambda inp: torch_allreduce(inp, group), inp_randn
            )
            symm_mem_eager_output, symm_mem_eager_time = _bench_eager_time(
                lambda inp: symm_mem_allreduce(inp, symm_mem_comm), inp_randn
            )
            symm_mem_graph_output, symm_mem_graph_time = _bench_graph_time(
                lambda inp: symm_mem_allreduce(inp, symm_mem_comm), inp_randn
            )
            # since pynccl is inplace op, this return result is not correct if graph loop > 1
            _, pynccl_graph_time = _bench_graph_time(
                lambda inp: pynccl_allreduce(inp, pynccl_comm), inp_randn
            )
            torch.testing.assert_close(torch_eager_output, symm_mem_graph_output)
            torch.testing.assert_close(torch_eager_output, symm_mem_eager_output)
            result.append(
                {
                    "msg_size": human_readable_size(inp_randn.nbytes),
                    "torch eager time": torch_eager_time,
                    "symm mem eager time": symm_mem_eager_time,
                    "symm mem graph time": symm_mem_graph_time,
                    "pynccl graph time": pynccl_graph_time,
                }
            )
            if rank == 0:
                print(f"sz={sz}, dtype={dtype}: correctness check PASS!")
    if rank == 0:
        print_markdown_table(result)
    if profile:
        prof_dir = f"prof/symm_mem"
        os.makedirs(prof_dir, exist_ok=True)
        ctx.export_chrome_trace(f"{prof_dir}/trace_rank{dist.get_rank()}.json.gz")
--- a/benchmark/lora/launch_server.py
+++ b/benchmark/lora/launch_server.py
@@ -28,6 +28,8 @@ def launch_server(args):
        cmd += "--disable-custom-all-reduce"
    if args.enable_mscclpp:
        cmd += "--enable-mscclpp"
    if args.enable_torch_symm_mem:
        cmd += "--enable-torch-symm-mem"
    print(cmd)
    os.system(cmd)
@@ -70,6 +72,11 @@ if __name__ == "__main__":
        action="store_true",
        help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
    )
    parser.add_argument(
        "--enable-torch-symm-mem",
        action="store_true",
        help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL.",
    )
    args = parser.parse_args()
    launch_server(args)
--- a/python/sglang/srt/distributed/device_communicators/all_reduce_utils.py
+++ b/python/sglang/srt/distributed/device_communicators/all_reduce_utils.py
@@ -0,0 +1,16 @@
 MiB = 1024 * 1024
 SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
    9: {
        2: 64 * MiB,  # 64 MB
        4: 32 * MiB,  # 32 MB
        6: 64 * MiB,  # 64 MB
        8: 64 * MiB,  # 64 MB
    },
    10: {
        2: 64 * MiB,  # 64 MB
        4: 32 * MiB,  # 32 MB
        6: 128 * MiB,  # 128 MB
        8: 128 * MiB,  # 128 MB
    },
 }
--- a/python/sglang/srt/distributed/device_communicators/symm_mem.py
+++ b/python/sglang/srt/distributed/device_communicators/symm_mem.py
@@ -0,0 +1,164 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/bf214ca22625e311a2c4c0dfbf7af19128f4919c/vllm/distributed/device_communicators/symm_mem.py
 import logging
 from typing import Optional, Union
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from sglang.srt.distributed.device_communicators.all_reduce_utils import (
    SYMM_MEM_ALL_REDUCE_MAX_SIZES,
 )
 from sglang.srt.utils import get_device_capability, is_cuda, is_hip
 try:
    import torch.distributed._symmetric_memory as torch_symm_mem
    symm_mem_available = True
 except ImportError:
    symm_mem_available = False
 logger = logging.getLogger(__name__)
 _is_cuda = is_cuda()
 _is_hip = is_hip()
 symm_mem_is_available = False
 if _is_hip:
    symm_mem_is_available = False
 if _is_cuda:
    symm_mem_is_available = True
 class SymmMemCommunicator:
    """
    Thin wrapper around symmetric-memory collectives.
    This communicator:
      - Validates device capability and world size.
      - Allocates a shared symmetric buffer.
      - Chooses between 'multimem' and 'two-shot' all-reduce kernels.
      - Exposes a fast-path all_reduce() compatible with bfloat16 inputs.
    If any prerequisite is not met, the instance remains disabled and will
    decline to perform symmetric-memory all-reduce.
    """
    # Mapping: compute capability major -> supported world sizes for multimem
    # If the current (cc_major, world_size) is not listed, we fall back
    # to the two-shot path.
    _WORLD_SIZES_MULTIMEM = {
        9: [4, 6, 8],
        10: [6, 8],
    }
    def __init__(self, group: ProcessGroup, device: Union[int, str, torch.device]):
        """
        Args:
            group: Torch process group used for rendezvous and naming.
            device: Target CUDA device (index, 'cuda:X', or torch.device).
        """
        self.disabled = True
        if not symm_mem_available:
            return
        if isinstance(device, int):
            device = torch.device(f"cuda:{device}")
        elif isinstance(device, str):
            device = torch.device(device)
        torch.cuda.set_device(device)
        self.dtype = torch.bfloat16
        self.device = device
        self.group = group
        self.world_size = dist.get_world_size(self.group)
        self.device_capability = torch.cuda.get_device_capability(device)[0]
        if self.device_capability < 9:
            logger.warning(
                "SymmMemCommunicator: Device capability %s not supported, "
                "communicator is not available.",
                self.device_capability,
            )
            return
        if self.world_size not in SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability]:
            logger.warning(
                "SymmMemCommunicator: World size %d not supported, "
                "communicator is not available.",
                self.world_size,
            )
            return
        self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][
            self.world_size
        ]
        self.buffer = torch_symm_mem.empty(
            self.max_size // self.dtype.itemsize,
            device=self.device,
            dtype=self.dtype,
        )
        handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name)
        if handle.multicast_ptr == 0:
            logger.warning(
                "SymmMemCommunicator: symmetric memory "
                "multicast operations are not supported."
            )
            self.buffer = None
            self.disabled = True
            return
        self.disabled = False
    def should_symm_mem_allreduce(self, inp: torch.Tensor):
        """
        Fast-path eligibility check for a given tensor.
        Conditions:
          - Communicator must be enabled.
          - dtype must be bfloat16 (matches kernel + buffer dtype).
          - Total byte size must be 4-byte aligned (hardware requirement).
          - Payload must be smaller than the symmetric-memory max size.
        Returns:
            True if the symmetric-memory path can handle this tensor.
        """
        if self.disabled:
            return False
        if inp.dtype != self.dtype:
            return False
        inp_size = inp.numel() * inp.element_size()
        # enforce 4-byte alignment
        if inp_size % 4 != 0:
            return False
        return inp_size < self.max_size
    def all_reduce(
        self, inp: torch.Tensor, *, out: Optional[torch.Tensor] = None
    ) -> Optional[torch.Tensor]:
        """
        Perform an in-place sum all-reduce via symmetric memory.
        Args:
            inp: Input tensor on the target CUDA device (bfloat16).
            out: Optional output tensor; if omitted, a new tensor is allocated.
        Returns:
            The reduced tensor (same shape as inp), or None if disabled.
        Implementation details:
            - Stages 'inp' into the symmetric buffer.
            - Selects 'multimem' or 'two_shot' kernel based on topology.
            - Writes the result into 'out' and returns it.
        """
        if out is None:
            out = torch.empty_like(inp)
        self.buffer[: inp.numel()].copy_(inp.view(-1))
        if self.world_size in self._WORLD_SIZES_MULTIMEM[self.device_capability]:
            torch.ops.symm_mem.multimem_all_reduce_(
                self.buffer[: inp.numel()], "sum", self.group.group_name
            )
        else:
            torch.ops.symm_mem.two_shot_all_reduce_(
                self.buffer[: inp.numel()], "sum", self.group.group_name
            )
        out.copy_(self.buffer[: inp.numel()].view(out.shape))
        return out
--- a/python/sglang/srt/distributed/parallel_state.py
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -215,12 +215,14 @@ class GroupCoordinator:
    use_pynccl: bool  # a hint of whether to use PyNccl
    use_pymscclpp: bool  # a hint of whether to use PyMsccl
    use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
    use_torch_symm_mem: bool  # a hint of whether to use SymmMemAllReduce
    use_message_queue_broadcaster: (
        bool  # a hint of whether to use message queue broadcaster
    )
    # communicators are only created for world size > 1
    pynccl_comm: Optional[Any]  # PyNccl communicator
    ca_comm: Optional[Any]  # Custom allreduce communicator
    symm_mem_comm: Optional[Any]  # Symm mem communicator
    mq_broadcaster: Optional[Any]  # shared memory broadcaster
    def __init__(
@@ -231,6 +233,7 @@ class GroupCoordinator:
        use_pynccl: bool,
        use_pymscclpp: bool,
        use_custom_allreduce: bool,
        use_torch_symm_mem: bool,
        use_hpu_communicator: bool,
        use_xpu_communicator: bool,
        use_npu_communicator: bool,
@@ -279,6 +282,7 @@ class GroupCoordinator:
        self.use_pynccl = use_pynccl
        self.use_pymscclpp = use_pymscclpp
        self.use_custom_allreduce = use_custom_allreduce
        self.use_torch_symm_mem = use_torch_symm_mem
        self.use_hpu_communicator = use_hpu_communicator
        self.use_xpu_communicator = use_xpu_communicator
        self.use_npu_communicator = use_npu_communicator
@@ -294,6 +298,9 @@ class GroupCoordinator:
        from sglang.srt.distributed.device_communicators.pynccl import (
            PyNcclCommunicator,
        )
        from sglang.srt.distributed.device_communicators.symm_mem import (
            SymmMemCommunicator,
        )
        if is_hip():
            from sglang.srt.distributed.device_communicators.quick_all_reduce import (
@@ -342,6 +349,13 @@ class GroupCoordinator:
                except Exception as e:
                    logger.warning(f"Failed to initialize QuickAllReduce: {e}")
        self.symm_mem_comm: Optional[SymmMemCommunicator] = None
        if self.use_torch_symm_mem and self.world_size > 1:
            self.symm_mem_comm = SymmMemCommunicator(
                group=self.cpu_group,
                device=self.device,
            )
        # Create communicator for other hardware backends
        from sglang.srt.distributed.device_communicators.hpu_communicator import (
            HpuCommunicator,
@@ -446,6 +460,7 @@ class GroupCoordinator:
            # custom allreduce       | enabled | enabled |
            # PyNccl                 | disabled| enabled |
            # PyMscclpp              | disabled| enabled |
            # TorchSymmMem           | disabled| enabled |
            # torch.distributed      | enabled | disabled|
            #
            # Note: When custom quick allreduce is enabled, a runtime check
@@ -547,7 +562,12 @@ class GroupCoordinator:
            and self.pymscclpp_comm.should_mscclpp_allreduce(input_)
        ):
            outplace_all_reduce_method = "pymscclpp"
-
+        elif (
            self.symm_mem_comm is not None
            and not self.symm_mem_comm.disabled
            and self.symm_mem_comm.should_symm_mem_allreduce(input_)
        ):
            outplace_all_reduce_method = "symm_mem"
        if outplace_all_reduce_method is not None:
            return torch.ops.sglang.outplace_all_reduce(
                input_,
@@ -564,6 +584,7 @@ class GroupCoordinator:
        ca_comm = self.ca_comm
        qr_comm = self.qr_comm
        pymscclpp_comm = self.pymscclpp_comm
        symm_mem_comm = self.symm_mem_comm
        assert any([qr_comm, ca_comm, pymscclpp_comm])
        if outplace_all_reduce_method == "ca":
            assert not ca_comm.disabled
@@ -571,6 +592,9 @@ class GroupCoordinator:
        elif outplace_all_reduce_method == "qr":
            assert not qr_comm.disabled
            out = qr_comm.quick_all_reduce(input_)
        elif outplace_all_reduce_method == "symm_mem":
            assert not symm_mem_comm.disabled
            out = symm_mem_comm.all_reduce(input_)
        else:
            assert not pymscclpp_comm.disabled
            out = pymscclpp_comm.all_reduce(input_)
@@ -1219,6 +1243,7 @@ def init_world_group(
        use_pynccl=False,
        use_pymscclpp=False,
        use_custom_allreduce=False,
        use_torch_symm_mem=False,
        use_hpu_communicator=False,
        use_xpu_communicator=False,
        use_npu_communicator=False,
@@ -1234,11 +1259,14 @@ def init_model_parallel_group(
    use_message_queue_broadcaster: bool = False,
    group_name: Optional[str] = None,
    use_mscclpp_allreduce: Optional[bool] = None,
    use_symm_mem_allreduce: Optional[bool] = None,
 ) -> GroupCoordinator:
    if use_custom_allreduce is None:
        use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
    if use_mscclpp_allreduce is None:
        use_mscclpp_allreduce = _ENABLE_MSCCLPP_ALL_REDUCE
    if use_symm_mem_allreduce is None:
        use_symm_mem_allreduce = _ENABLE_SYMM_MEM_ALL_REDUCE
    return GroupCoordinator(
        group_ranks=group_ranks,
        local_rank=local_rank,
@@ -1246,6 +1274,7 @@ def init_model_parallel_group(
        use_pynccl=not _is_npu,
        use_pymscclpp=use_mscclpp_allreduce,
        use_custom_allreduce=use_custom_allreduce,
        use_torch_symm_mem=use_symm_mem_allreduce,
        use_hpu_communicator=True,
        use_xpu_communicator=True,
        use_npu_communicator=True,
@@ -1331,6 +1360,7 @@ logger = logging.getLogger(__name__)
 _ENABLE_CUSTOM_ALL_REDUCE = True
 _ENABLE_MSCCLPP_ALL_REDUCE = False
 _ENABLE_SYMM_MEM_ALL_REDUCE = False
 def set_custom_all_reduce(enable: bool):
@@ -1343,6 +1373,11 @@ def set_mscclpp_all_reduce(enable: bool):
    _ENABLE_MSCCLPP_ALL_REDUCE = enable
 def set_symm_mem_all_reduce(enable: bool):
    global _ENABLE_SYMM_MEM_ALL_REDUCE
    _ENABLE_SYMM_MEM_ALL_REDUCE = enable
 def init_distributed_environment(
    world_size: int = -1,
    rank: int = -1,
--- a/python/sglang/srt/layers/dp_attention.py
+++ b/python/sglang/srt/layers/dp_attention.py
@@ -263,6 +263,7 @@ def initialize_dp_attention(
        use_pynccl=SYNC_TOKEN_IDS_ACROSS_TP,
        use_pymscclpp=False,
        use_custom_allreduce=False,
        use_torch_symm_mem=False,
        use_hpu_communicator=False,
        use_xpu_communicator=False,
        use_npu_communicator=False,
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -42,6 +42,7 @@ from sglang.srt.distributed import (
    initialize_model_parallel,
    set_custom_all_reduce,
    set_mscclpp_all_reduce,
    set_symm_mem_all_reduce,
 )
 from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
 from sglang.srt.eplb.eplb_manager import EPLBManager
@@ -646,6 +647,7 @@ class ModelRunner:
            dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
        set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
        set_mscclpp_all_reduce(self.server_args.enable_mscclpp)
        set_symm_mem_all_reduce(self.server_args.enable_torch_symm_mem)
        if not self.is_draft_worker:
            if self.device == "cpu":
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -382,6 +382,7 @@ class ServerArgs:
    disable_outlines_disk_cache: bool = False
    disable_custom_all_reduce: bool = False
    enable_mscclpp: bool = False
    enable_torch_symm_mem: bool = False
    disable_overlap_schedule: bool = False
    enable_mixed_chunk: bool = False
    enable_dp_attention: bool = False
@@ -2443,6 +2444,11 @@ class ServerArgs:
            action="store_true",
            help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
        )
        parser.add_argument(
            "--enable-torch-symm-mem",
            action="store_true",
            help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA device SM90 and above. SM90 supports world size 4, 6, 8. SM10 supports world size 6, 8.",
        )
        parser.add_argument(
            "--disable-overlap-schedule",
            action="store_true",