[Docs]: Fix Multi-User Port Allocation Conflicts (#3601)

Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com> Co-authored-by: simveit <simp.veitner@gmail.com>
2025-02-19 19:15:44 +00:00
parent 6b0aeb58fd
commit 55de40f782
12 changed files with 168 additions and 117 deletions
--- a/docs/README.md
+++ b/docs/README.md
@@ -36,42 +36,70 @@ find . -name '*.ipynb' -exec nbstripout {} \;
 # After these checks pass, push your changes and open a PR on your branch
 pre-commit run --all-files
 ```
 ---
 ### **Port Allocation and CI Efficiency**
-If you need to run and shut down a SGLang server or engine, following these examples:
+**To launch and kill the server:**
 1. Launch and close Server:
 ```python
-#Launch Sever
+from sglang.test.test_utils import is_in_ci
 from sglang.utils import wait_for_server, print_highlight, terminate_process
-from sglang.utils import (
+if is_in_ci():
-    execute_shell_command,
+    from patch import launch_server_cmd
-    wait_for_server,
+else:
-    terminate_process,
+    from sglang.utils import launch_server_cmd
-    print_highlight,
+
 server_process, port = launch_server_cmd(
    """
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
 --host 0.0.0.0
 """
 )
-server_process = execute_shell_command(
+wait_for_server(f"http://localhost:{port}")
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0"
 )
 wait_for_server("http://localhost:30000")
 # Terminate Sever
 # Terminate Server
 terminate_process(server_process)
 ```
-2. Launch Engine and close Engine
+
 **To launch and kill the engine:**
 ```python
 # Launch Engine
 import sglang as sgl
 import asyncio
 from sglang.test.test_utils import is_in_ci
 if is_in_ci():
    import patch
 llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
 # Terminate Engine
 llm.shutdown()
 ```
 ### **Why this approach?**
 - **Dynamic Port Allocation**: Avoids port conflicts by selecting an available port at runtime, enabling multiple server instances to run in parallel.
 - **Optimized for CI**: The `patch` versions of `launch_server_cmd` and `sgl.Engine()` used in CI environments help manage GPU memory dynamically, preventing conflicts and improving test parallelism.
 - **Better Parallel Execution**: Ensures smooth concurrent tests by avoiding fixed port collisions and optimizing memory usage.
 ### **Model Selection**
 For demonstrations in the docs, **prefer smaller models** to reduce memory consumption and speed up inference. Running larger models in CI can lead to instability due to memory constraints.
 ### **Prompt Alignment Example**
 When designing prompts, ensure they align with SGLang’s structured formatting. For example:
 ```python
 prompt = """You are an AI assistant. Answer concisely and accurately.
 User: What is the capital of France?
 Assistant: The capital of France is Paris."""
 ```
 This keeps responses aligned with expected behavior and improves reliability across different files.
--- a/docs/backend/function_calling.ipynb
+++ b/docs/backend/function_calling.ipynb
@@ -405,7 +405,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(server_process, port)"
+    "terminate_process(server_process)"
   ]
  },
  {
--- a/docs/backend/native_api.ipynb
+++ b/docs/backend/native_api.ipynb
@@ -252,7 +252,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(server_process, port)\n",
+    "terminate_process(server_process)\n",
    "\n",
    "embedding_process, port = launch_server_cmd(\n",
    "    \"\"\"\n",
@@ -286,7 +286,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(embedding_process, port)"
+    "terminate_process(embedding_process)"
   ]
  },
  {
@@ -304,7 +304,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(embedding_process, port)\n",
+    "terminate_process(embedding_process)\n",
    "\n",
    "# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
    "# This will be updated in the future.\n",
@@ -355,7 +355,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(reward_process, port)"
+    "terminate_process(reward_process)"
   ]
  },
  {
@@ -425,7 +425,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(tokenizer_free_server_process, port)"
+    "terminate_process(tokenizer_free_server_process)"
   ]
  }
 ],
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -512,7 +512,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(server_process, port)"
+    "terminate_process(server_process)"
   ]
  }
 ],
--- a/docs/backend/openai_api_embeddings.ipynb
+++ b/docs/backend/openai_api_embeddings.ipynb
@@ -169,7 +169,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(embedding_process, port)"
+    "terminate_process(embedding_process)"
   ]
  }
 ],
--- a/docs/backend/openai_api_vision.ipynb
+++ b/docs/backend/openai_api_vision.ipynb
@@ -243,7 +243,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(embedding_process, port)"
+    "terminate_process(embedding_process)"
   ]
  },
  {
--- a/docs/backend/patch.py
+++ b/docs/backend/patch.py
@@ -1,4 +1,5 @@
 import os
 import weakref
 from sglang.utils import execute_shell_command, reserve_port
@@ -21,15 +22,29 @@ def patched_post_init(self):
 server_args_mod.ServerArgs.__post_init__ = patched_post_init
 process_socket_map = weakref.WeakKeyDictionary()
 def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
    """
    Launch the server using the given command.
    If no port is specified, a free port is reserved.
    """
    if port is None:
-        port = reserve_port()
+        port, lock_socket = reserve_port(host)
    else:
        lock_socket = None
    extra_flags = (
        f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
        f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
        f"--disable-cuda-graph"
    )
    full_command = f"{command} --port {port} {extra_flags}"
    process = execute_shell_command(full_command)
    if lock_socket is not None:
        process_socket_map[process] = lock_socket
    return process, port
--- a/docs/backend/send_request.ipynb
+++ b/docs/backend/send_request.ipynb
@@ -243,15 +243,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(server_process, port)"
+    "terminate_process(server_process)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
--- a/docs/backend/structured_outputs.ipynb
+++ b/docs/backend/structured_outputs.ipynb
@@ -397,7 +397,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "terminate_process(server_process, port)"
+    "terminate_process(server_process)"
   ]
  },
  {
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,5 +1,5 @@
 SGLang Documentation
-====================================
+====================
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -10,7 +10,6 @@ The core features include:
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 .. toctree::
   :maxdepth: 1
   :caption: Getting Started
@@ -39,7 +38,6 @@ The core features include:
   frontend/frontend.md
   frontend/choices_methods.md
 .. toctree::
   :maxdepth: 1
   :caption: SGLang Router
@@ -47,24 +45,47 @@ The core features include:
   router/router.md
 References
 ==========
 General
 ---------------------
 .. toctree::
   :maxdepth: 1
   :caption: References
   references/supported_models.md
   references/contribution_guide.md
   references/troubleshooting.md
   references/faq.md
   references/learn_more.md
 Hardware
 --------------------------
 .. toctree::
   :maxdepth: 1
   references/AMD.md
   references/amd_configure.md
   references/nvidia_jetson.md
 Advanced Models & Deployment
 ------------------------------
 .. toctree::
   :maxdepth: 1
   references/deepseek.md
   references/multi_node.md
   references/multi_node_inference_k8s_lws.md
   references/modelscope.md
 Performance & Tuning
 --------------------
 .. toctree::
   :maxdepth: 1
   references/sampling_params.md
   references/hyperparameter_tuning.md
   references/benchmark_and_profiling.md
   references/accuracy_evaluation.md
   references/custom_chat_template.md
   references/amd_configure.md
   references/deepseek.md
   references/multi_node.md
   references/multi_node_inference_k8s_lws.md
   references/modelscope.md
   references/quantization.md
   references/contribution_guide.md
   references/troubleshooting.md
   references/nvidia_jetson.md
   references/faq.md
   references/learn_more.md
--- a/docs/start/install.md
+++ b/docs/start/install.md
@@ -2,19 +2,27 @@
 You can install SGLang using any of the methods below. For running DeepSeek V3/R1 with SGLang, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is always recommended to use the [latest release version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid already-fixed issues and environment-related problems.
-## Method 1: With pip
+## Method 1: With pip or uv
-```
+
 We recommend using uv to install the dependencies, since it installs packages faster than pip:
 ```bash
 pip install --upgrade pip
-pip install sgl-kernel --force-reinstall --no-deps
+pip install uv
-pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
+uv pip install sgl-kernel --force-reinstall --no-deps
 uv pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
 ```
-Note: SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`.
+**Quick Fix to Installation**
-If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`， please try either of the following solutions:
+- SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`.
- Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
+- If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions:
- Follow the procedure described in [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above.
+
 1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
 2. Follow the procedure described in [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above.
 - If you encounter `ImportError: cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, try to use the specified version of `transformers` in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, just run `pip install transformers==4.48.3`.
 ## Method 2: From source
 ```
--- a/python/sglang/utils.py
+++ b/python/sglang/utils.py
@@ -5,12 +5,15 @@ import importlib
 import json
 import logging
 import os
 import random
 import signal
 import socket
 import subprocess
 import sys
 import time
 import traceback
 import urllib.request
 import weakref
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from json import dumps
@@ -21,6 +24,8 @@ import requests
 from IPython.display import HTML, display
 from tqdm import tqdm
 from sglang.srt.utils import kill_process_tree
 logger = logging.getLogger(__name__)
@@ -306,27 +311,12 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
    return filename
 import fcntl
 def is_in_ci():
    """Return the result of ``sglang.test.test_utils.is_in_ci()``."""
    # Imported at call time rather than at module import time.
    import sglang.test.test_utils as test_utils
    return test_utils.is_in_ci()
 # Path of a lock file in the current user's home directory.
 LOCKFILE = os.path.expanduser("~/.sglang_port_lock")
 # Path of a JSON registry file in the current user's home directory.
 PORT_REGISTRY = os.path.expanduser("~/.sglang_port_registry.json")
 # Import-time side effect: create an empty lock file if none exists yet.
 if not os.path.exists(LOCKFILE):
    with open(LOCKFILE, "w") as f:
        pass
 # Import-time side effect: initialize the registry with an empty JSON list
 # if the file does not exist yet.
 if not os.path.exists(PORT_REGISTRY):
    with open(PORT_REGISTRY, "w") as f:
        json.dump([], f)
 def print_highlight(html_content: str):
    if is_in_ci():
        html_content = str(html_content).replace("\n", "<br>")
@@ -335,55 +325,44 @@ def print_highlight(html_content: str):
        print(html_content)
-def init_port_registry():
+process_socket_map = weakref.WeakKeyDictionary()
    """Initialize the port registry file if it doesn't exist."""
    if not os.path.exists(PORT_REGISTRY):
        with open(PORT_REGISTRY, "w") as f:
            json.dump([], f)
-def reserve_port(start=30000, end=40000):
+def reserve_port(host, start=30000, end=40000):
    """
-    Reserve an available port using a file lock and a registry.
+    Reserve an available port by trying to bind a socket.
-    Returns the allocated port.
+    Returns a tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.
    """
-    init_port_registry()
+    candidates = list(range(start, end))
-    with open(LOCKFILE, "w") as lock:
+    random.shuffle(candidates)
-        fcntl.flock(lock, fcntl.LOCK_EX)
+
    for port in candidates:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
-            with open(PORT_REGISTRY, "r") as f:
+            # Attempt to bind to the port on the given host
-                used = json.load(f)
+            sock.bind((host, port))
-        except Exception:
+            return port, sock
-            used = []
+        except socket.error:
-        for port in range(start, end):
+            sock.close()  # Failed to bind, try next port
-            if port not in used:
+            continue
-                used.append(port)
+    raise RuntimeError("No free port available.")
                with open(PORT_REGISTRY, "w") as f:
                    json.dump(used, f)
                return port
    raise RuntimeError("No free port available")
-def release_port(port):
+def release_port(lock_socket):
-    """Release the reserved port by removing it from the registry."""
+    """
-    with open(LOCKFILE, "w") as lock:
+    Release the reserved port by closing the lock socket.
-        fcntl.flock(lock, fcntl.LOCK_EX)
+    """
    try:
-            with open(PORT_REGISTRY, "r") as f:
+        lock_socket.close()
-                used = json.load(f)
+    except Exception as e:
-        except Exception:
+        print(f"Error closing socket: {e}")
            used = []
        if port in used:
            used.remove(port)
        with open(PORT_REGISTRY, "w") as f:
            json.dump(used, f)
 def execute_shell_command(command: str) -> subprocess.Popen:
    """
    Execute a shell command and return its process handle.

    The command string is tokenized and started directly; no shell is
    involved. Backslashes, including backslash-newline line continuations,
    are replaced by spaces first, so multi-line commands can be passed as-is.
    Quoted arguments (e.g. ``--flag "a b"``) are kept together as a single
    argument. If the quoting is unbalanced, plain whitespace splitting is
    used instead.

    Args:
        command: The command line to run.

    Returns:
        The ``subprocess.Popen`` handle of the started process. Its stderr
        is redirected to its stdout.
    """
    import shlex

    # Replace newline continuations (and any other backslash) with spaces.
    command = command.replace("\\\n", " ").replace("\\", " ")
    try:
        # Honor shell-style quoting so quoted arguments stay in one piece.
        parts = shlex.split(command)
    except ValueError:
        # Unbalanced quotes: fall back to plain whitespace splitting.
        parts = command.split()
    return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)
@@ -395,21 +374,28 @@ def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
    If no port is specified, a free port is reserved.
    """
    if port is None:
-        port = reserve_port()
+        port, lock_socket = reserve_port(host)
    else:
        lock_socket = None
    full_command = f"{command} --port {port}"
    process = execute_shell_command(full_command)
    if lock_socket is not None:
        process_socket_map[process] = lock_socket
    return process, port
-def terminate_process(process, port=None):
+def terminate_process(process):
    """
-    Terminate the process and, if a port was reserved, release it.
+    Terminate the process and automatically release the reserved port.
    """
    from sglang.srt.utils import kill_process_tree
    kill_process_tree(process.pid)
-    if port is not None:
+
-        release_port(port)
+    lock_socket = process_socket_map.pop(process, None)
    if lock_socket is not None:
        release_port(lock_socket)
 def wait_for_server(base_url: str, timeout: int = None) -> None: