[Feat] Support Torch Symm Mem AllReduce (#10571)

Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
This commit is contained in:
Yuan Luo
2025-10-06 04:55:19 +08:00
committed by GitHub
parent 148d8d485d
commit 590f2da052
8 changed files with 466 additions and 1 deletion

View File

@@ -42,6 +42,7 @@ from sglang.srt.distributed import (
initialize_model_parallel,
set_custom_all_reduce,
set_mscclpp_all_reduce,
set_symm_mem_all_reduce,
)
from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
from sglang.srt.eplb.eplb_manager import EPLBManager
@@ -646,6 +647,7 @@ class ModelRunner:
dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
set_mscclpp_all_reduce(self.server_args.enable_mscclpp)
set_symm_mem_all_reduce(self.server_args.enable_torch_symm_mem)
if not self.is_draft_worker:
if self.device == "cpu":