diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index 7f01e312c..942a53c37 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -297,7 +297,7 @@ def _set_envs_and_config(server_args: ServerArgs): # Set global environments os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" os.environ["NCCL_CUMEM_ENABLE"] = "0" - os.environ["NCCL_NVLS_ENABLE"] = "0" + os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls)) os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4" diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 89f2c1b5b..93f797087 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -140,6 +140,7 @@ class ServerArgs: disable_jump_forward: bool = False disable_cuda_graph: bool = False disable_cuda_graph_padding: bool = False + enable_nccl_nvls: bool = False disable_outlines_disk_cache: bool = False disable_custom_all_reduce: bool = False disable_mla: bool = False @@ -783,6 +784,11 @@ class ServerArgs: action="store_true", help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.", ) + parser.add_argument( + "--enable-nccl-nvls", + action="store_true", + help="Enable NCCL NVLS for prefill heavy requests when available.", + ) parser.add_argument( "--disable-outlines-disk-cache", action="store_true",