diff --git a/docs/references/deepseek.md b/docs/references/deepseek.md index f3599ff3b..0267ab5a8 100644 --- a/docs/references/deepseek.md +++ b/docs/references/deepseek.md @@ -81,3 +81,9 @@ Overall, with these optimizations, we have achieved up to a 7x acceleration in o - **Weight**: Per-128x128-block quantization for better numerical stability. **Usage**: turn on by default for DeepSeek V3 models. + +## FAQ + +**Question**: What should I do if model loading takes too long and NCCL timeout occurs? + +Answer: You can try to add `--dist-timeout 3600` when launching the model, this allows for 1-hour timeout. diff --git a/python/sglang/srt/distributed/parallel_state.py b/python/sglang/srt/distributed/parallel_state.py index c6d1a8307..231efb965 100644 --- a/python/sglang/srt/distributed/parallel_state.py +++ b/python/sglang/srt/distributed/parallel_state.py @@ -30,6 +30,7 @@ import weakref from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass +from datetime import timedelta from multiprocessing import shared_memory from typing import Any, Callable, Dict, List, Optional, Tuple, Union from unittest.mock import patch @@ -960,6 +961,7 @@ def init_distributed_environment( distributed_init_method: str = "env://", local_rank: int = -1, backend: str = "nccl", + timeout: Optional[int] = None, ): logger.debug( "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s", @@ -974,13 +976,20 @@ def init_distributed_environment( "distributed_init_method must be provided when initializing " "distributed environment" ) + if timeout is not None: + assert isinstance(timeout, (int)), "timeout must be a number" + assert timeout > 0, "timeout must be positive" + timeout = timedelta(seconds=timeout) + # this backend is used for WORLD torch.distributed.init_process_group( backend=backend, init_method=distributed_init_method, world_size=world_size, rank=rank, + timeout=timeout, ) + # set the local rank # 
local_rank is not available in torch ProcessGroup, # see https://github.com/pytorch/pytorch/issues/122816 diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index fc0f9747a..df9ed3d9d 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -259,6 +259,7 @@ class ModelRunner: rank=self.tp_rank, local_rank=self.gpu_id, distributed_init_method=dist_init_method, + timeout=self.server_args.dist_timeout, ) initialize_model_parallel(tensor_model_parallel_size=self.tp_size) initialize_dp_attention( diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index a81228ce3..610c0f5a8 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -79,6 +79,7 @@ class ServerArgs: random_seed: Optional[int] = None constrained_json_whitespace_pattern: Optional[str] = None watchdog_timeout: float = 300 + dist_timeout: Optional[int] = None # timeout for torch.distributed download_dir: Optional[str] = None base_gpu_id: int = 0 @@ -534,6 +535,12 @@ class ServerArgs: default=ServerArgs.watchdog_timeout, help="Set watchdog timeout in seconds. 
If a forward batch takes longer than this, the server will crash to prevent hanging.", ) + parser.add_argument( + "--dist-timeout", + type=int, + default=ServerArgs.dist_timeout, + help="Set timeout for torch.distributed initialization.", + ) parser.add_argument( "--download-dir", type=str, diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index b303f1912..3dc1ae347 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -503,7 +503,9 @@ def run_unittest_files(files: List[str], timeout_per_file: float): ret_code = run_with_timeout( run_one_file, args=(filename,), timeout=timeout_per_file ) - assert ret_code == 0 + assert ( + ret_code == 0 + ), f"expected return code 0, but {filename} returned {ret_code}" except TimeoutError: kill_process_tree(process.pid) time.sleep(5)