Add --enable-p2p-check option (#599)
This commit is contained in:
@@ -362,7 +362,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
|
|||||||
```
|
```
|
||||||
|
|
||||||
### Additional Arguments
|
### Additional Arguments
|
||||||
- Add `--tp 2` to enable tensor parallelism.
|
- Add `--tp 2` to enable tensor parallelism. If the server fails with the error `peer access is not supported between these two devices`, add the `--enable-p2p-check` option.
|
||||||
```
|
```
|
||||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -259,7 +259,10 @@ class ModelRunner:
|
|||||||
logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.")
|
logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.")
|
||||||
torch.cuda.set_device(self.gpu_id)
|
torch.cuda.set_device(self.gpu_id)
|
||||||
logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
|
logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
|
||||||
|
|
||||||
|
if not server_args.enable_p2p_check:
|
||||||
monkey_patch_vllm_p2p_access_check(self.gpu_id)
|
monkey_patch_vllm_p2p_access_check(self.gpu_id)
|
||||||
|
|
||||||
if server_args.nccl_init_addr:
|
if server_args.nccl_init_addr:
|
||||||
nccl_init_method = f"tcp://{server_args.nccl_init_addr}"
|
nccl_init_method = f"tcp://{server_args.nccl_init_addr}"
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -55,6 +55,7 @@ class ServerArgs:
|
|||||||
disable_regex_jump_forward: bool = False
|
disable_regex_jump_forward: bool = False
|
||||||
disable_disk_cache: bool = False
|
disable_disk_cache: bool = False
|
||||||
attention_reduce_in_fp32: bool = False
|
attention_reduce_in_fp32: bool = False
|
||||||
|
enable_p2p_check: bool = False
|
||||||
|
|
||||||
# Distributed args
|
# Distributed args
|
||||||
nccl_init_addr: Optional[str] = None
|
nccl_init_addr: Optional[str] = None
|
||||||
@@ -304,6 +305,11 @@ class ServerArgs:
|
|||||||
help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
|
help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
|
||||||
"This only affects Triton attention kernels",
|
"This only affects Triton attention kernels",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--enable-p2p-check",
|
||||||
|
action="store_true",
|
||||||
|
help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_cli_args(cls, args: argparse.Namespace):
|
def from_cli_args(cls, args: argparse.Namespace):
|
||||||
|
|||||||
@@ -458,12 +458,7 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int):
|
|||||||
NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
|
NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# TODO: need a better check than just dev str name match
|
|
||||||
# compat: skip RTX 40 series as they do not have P2P feature and even checking for them may cause errors
|
|
||||||
device_name = torch.cuda.get_device_name(gpu_id)
|
|
||||||
if "RTX 40" not in device_name:
|
|
||||||
import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
|
import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
|
||||||
|
|
||||||
setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
|
setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user