From 5304b4ef58ecf101abac01c80d2dd5fe1e506d7f Mon Sep 17 00:00:00 2001
From: Liangsheng Yin
Date: Sat, 6 Jul 2024 23:34:10 -0700
Subject: [PATCH] Add `--enable-p2p-check` option (#599)

---
 README.md                                             | 2 +-
 python/sglang/srt/managers/controller/model_runner.py | 5 ++++-
 python/sglang/srt/server_args.py                      | 6 ++++++
 python/sglang/srt/utils.py                            | 9 ++-------
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index c22c257b5..88003e9af 100644
--- a/README.md
+++ b/README.md
@@ -362,7 +362,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 
 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py
index 879f44151..0bb869cf7 100644
--- a/python/sglang/srt/managers/controller/model_runner.py
+++ b/python/sglang/srt/managers/controller/model_runner.py
@@ -259,7 +259,10 @@ class ModelRunner:
         logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.")
         torch.cuda.set_device(self.gpu_id)
         logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
-        monkey_patch_vllm_p2p_access_check(self.gpu_id)
+
+        if not server_args.enable_p2p_check:
+            monkey_patch_vllm_p2p_access_check(self.gpu_id)
+
         if server_args.nccl_init_addr:
             nccl_init_method = f"tcp://{server_args.nccl_init_addr}"
         else:
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 14cf4d3b0..698a7bcc0 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -55,6 +55,7 @@ class ServerArgs:
     disable_regex_jump_forward: bool = False
     disable_disk_cache: bool = False
     attention_reduce_in_fp32: bool = False
+    enable_p2p_check: bool = False
 
     # Distributed args
     nccl_init_addr: Optional[str] = None
@@ -304,6 +305,11 @@ class ServerArgs:
             help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
             "This only affects Triton attention kernels",
         )
+        parser.add_argument(
+            "--enable-p2p-check",
+            action="store_true",
+            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index a9ea62e4b..e6b7b7663 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -458,13 +458,8 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int):
     NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
     """
 
-    # TODO: need a better check than just dev str name match
-    # compat: skip RTX 40 series as they do not have P2P feature and even checking for them may cause errors
-    device_name = torch.cuda.get_device_name(gpu_id)
-    if "RTX 40" not in device_name:
-        import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
-
-        setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
+    import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
+    setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
 
 
 def monkey_patch_vllm_dummy_weight_loader():