From 55de40f782d1949740aec74e88ae7cce00d59582 Mon Sep 17 00:00:00 2001 From: Shi Shuai <126407087+shuaills@users.noreply.github.com> Date: Wed, 19 Feb 2025 19:15:44 +0000 Subject: [PATCH] [Docs]: Fix Multi-User Port Allocation Conflicts (#3601) Co-authored-by: zhaochenyang20 Co-authored-by: simveit --- docs/README.md | 64 ++++++++++---- docs/backend/function_calling.ipynb | 2 +- docs/backend/native_api.ipynb | 10 +-- docs/backend/openai_api_completions.ipynb | 2 +- docs/backend/openai_api_embeddings.ipynb | 2 +- docs/backend/openai_api_vision.ipynb | 2 +- docs/backend/patch.py | 17 +++- docs/backend/send_request.ipynb | 9 +- docs/backend/structured_outputs.ipynb | 2 +- docs/index.rst | 49 ++++++++--- docs/start/install.md | 24 +++-- python/sglang/utils.py | 102 ++++++++++------------ 12 files changed, 168 insertions(+), 117 deletions(-) diff --git a/docs/README.md b/docs/README.md index 0a12d64b1..9cd59bda5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -36,42 +36,70 @@ find . -name '*.ipynb' -exec nbstripout {} \; # After these checks pass, push your changes and open a PR on your branch pre-commit run --all-files ``` +--- +### **Port Allocation and CI Efficiency** -If you need to run and shut down a SGLang server or engine, following these examples: - -1. 
Launch and close Sever: +**To launch and kill the server:** ```python -#Launch Sever +from sglang.test.test_utils import is_in_ci +from sglang.utils import wait_for_server, print_highlight, terminate_process -from sglang.utils import ( - execute_shell_command, - wait_for_server, - terminate_process, - print_highlight, +if is_in_ci(): + from patch import launch_server_cmd +else: + from sglang.utils import launch_server_cmd + +server_process, port = launch_server_cmd( + """ +python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ + --host 0.0.0.0 +""" ) -server_process = execute_shell_command( - "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0" -) - -wait_for_server("http://localhost:30000") - -# Terminate Sever +wait_for_server(f"http://localhost:{port}") +# Terminate Server terminate_process(server_process) ``` -2. Launch Engine and close Engine + +**To launch and kill the engine:** ```python # Launch Engine - import sglang as sgl import asyncio +from sglang.test.test_utils import is_in_ci + +if is_in_ci(): + import patch llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct") # Terminalte Engine llm.shutdown() ``` + +### **Why this approach?** + +- **Dynamic Port Allocation**: Avoids port conflicts by selecting an available port at runtime, enabling multiple server instances to run in parallel. +- **Optimized for CI**: The `patch` version of `launch_server_cmd` and `sgl.Engine()` in CI environments helps manage GPU memory dynamically, preventing conflicts and improving test parallelism. +- **Better Parallel Execution**: Ensures smooth concurrent tests by avoiding fixed port collisions and optimizing memory usage. + +### **Model Selection** + +For demonstrations in the docs, **prefer smaller models** to reduce memory consumption and speed up inference. Running larger models in CI can lead to instability due to memory constraints. 
+ +### **Prompt Alignment Example** + +When designing prompts, ensure they align with SGLang’s structured formatting. For example: + +```python +prompt = """You are an AI assistant. Answer concisely and accurately. + +User: What is the capital of France? +Assistant: The capital of France is Paris.""" +``` + +This keeps responses aligned with expected behavior and improves reliability across different files. diff --git a/docs/backend/function_calling.ipynb b/docs/backend/function_calling.ipynb index e80c91d54..a5c469623 100644 --- a/docs/backend/function_calling.ipynb +++ b/docs/backend/function_calling.ipynb @@ -405,7 +405,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)" + "terminate_process(server_process)" ] }, { diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 2208bab35..7f3a67c80 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -252,7 +252,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)\n", + "terminate_process(server_process)\n", "\n", "embedding_process, port = launch_server_cmd(\n", " \"\"\"\n", @@ -286,7 +286,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(embedding_process, port)" + "terminate_process(embedding_process)" ] }, { @@ -304,7 +304,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(embedding_process, port)\n", + "terminate_process(embedding_process)\n", "\n", "# Note that SGLang now treats embedding models and reward models as the same type of models.\n", "# This will be updated in the future.\n", @@ -355,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(reward_process, port)" + "terminate_process(reward_process)" ] }, { @@ -425,7 +425,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(tokenizer_free_server_process, port)" + "terminate_process(tokenizer_free_server_process)" ] } ], diff --git 
a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index e9c3f360a..96ee05022 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -512,7 +512,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)" + "terminate_process(server_process)" ] } ], diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 5a86ef18e..38543fa3b 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -169,7 +169,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(embedding_process, port)" + "terminate_process(embedding_process)" ] } ], diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 2ce921f50..2bb45e50f 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -243,7 +243,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(embedding_process, port)" + "terminate_process(embedding_process)" ] }, { diff --git a/docs/backend/patch.py b/docs/backend/patch.py index 1623c1b1f..d16422d08 100644 --- a/docs/backend/patch.py +++ b/docs/backend/patch.py @@ -1,4 +1,5 @@ import os +import weakref from sglang.utils import execute_shell_command, reserve_port @@ -21,15 +22,29 @@ def patched_post_init(self): server_args_mod.ServerArgs.__post_init__ = patched_post_init +process_socket_map = weakref.WeakKeyDictionary() + def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None): + """ + Launch the server using the given command. + If no port is specified, a free port is reserved. 
+ """ if port is None: - port = reserve_port() + port, lock_socket = reserve_port(host) + else: + lock_socket = None + extra_flags = ( f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} " f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} " f"--disable-cuda-graph" ) + full_command = f"{command} --port {port} {extra_flags}" process = execute_shell_command(full_command) + + if lock_socket is not None: + process_socket_map[process] = lock_socket + return process, port diff --git a/docs/backend/send_request.ipynb b/docs/backend/send_request.ipynb index ea6398da1..610538760 100644 --- a/docs/backend/send_request.ipynb +++ b/docs/backend/send_request.ipynb @@ -243,15 +243,8 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)" + "terminate_process(server_process)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/backend/structured_outputs.ipynb b/docs/backend/structured_outputs.ipynb index 31d59adf9..1fe1b0576 100644 --- a/docs/backend/structured_outputs.ipynb +++ b/docs/backend/structured_outputs.ipynb @@ -397,7 +397,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)" + "terminate_process(server_process)" ] }, { diff --git a/docs/index.rst b/docs/index.rst index b58d99a8e..62c7383da 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ SGLang Documentation -==================================== +==================== SGLang is a fast serving framework for large language models and vision language models. It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language. 
@@ -10,7 +10,6 @@ The core features include: - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models. - **Active Community**: SGLang is open-source and backed by an active community with industry adoption. - .. toctree:: :maxdepth: 1 :caption: Getting Started @@ -39,7 +38,6 @@ The core features include: frontend/frontend.md frontend/choices_methods.md - .. toctree:: :maxdepth: 1 :caption: SGLang Router @@ -47,24 +45,47 @@ The core features include: router/router.md +References +========== + +General +--------------------- .. toctree:: :maxdepth: 1 - :caption: References references/supported_models.md + references/contribution_guide.md + references/troubleshooting.md + references/faq.md + references/learn_more.md + +Hardware +-------------------------- +.. toctree:: + :maxdepth: 1 + + references/AMD.md + references/amd_configure.md + references/nvidia_jetson.md + +Advanced Models & Deployment +------------------------------ +.. toctree:: + :maxdepth: 1 + + references/deepseek.md + references/multi_node.md + references/multi_node_inference_k8s_lws.md + references/modelscope.md + +Performance & Tuning +-------------------- +.. 
toctree:: + :maxdepth: 1 + references/sampling_params.md references/hyperparameter_tuning.md references/benchmark_and_profiling.md references/accuracy_evaluation.md references/custom_chat_template.md - references/amd_configure.md - references/deepseek.md - references/multi_node.md - references/multi_node_inference_k8s_lws.md - references/modelscope.md references/quantization.md - references/contribution_guide.md - references/troubleshooting.md - references/nvidia_jetson.md - references/faq.md - references/learn_more.md diff --git a/docs/start/install.md b/docs/start/install.md index db29241c8..4fadf597a 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -2,19 +2,27 @@ You can install SGLang using any of the methods below. For running DeepSeek V3/R1 with SGLang, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is always recommended to use the [latest release version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid fixed issues and environment-related problems. -## Method 1: With pip -``` +## Method 1: With pip or uv + +We recommend using uv to install the dependencies with a higher installation speed: + +```bash pip install --upgrade pip -pip install sgl-kernel --force-reinstall --no-deps -pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python +pip install uv +uv pip install sgl-kernel --force-reinstall --no-deps +uv pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python ``` -Note: SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). 
Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`. +**Quick Fix to Installation** -If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions: +- SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`. -- Use `export CUDA_HOME=/usr/local/cuda-` to set the `CUDA_HOME` environment variable. -- Follow the procedure described in [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above. +- If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions: + +1. Use `export CUDA_HOME=/usr/local/cuda-` to set the `CUDA_HOME` environment variable. +2. Follow the procedure described in [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above. + +- If you encounter `ImportError: cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, try to use the specified version of `transformers` in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, just run `pip install transformers==4.48.3`. 
## Method 2: From source ``` diff --git a/python/sglang/utils.py b/python/sglang/utils.py index d83022303..4a751aa88 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -5,12 +5,15 @@ import importlib import json import logging import os +import random import signal +import socket import subprocess import sys import time import traceback import urllib.request +import weakref from concurrent.futures import ThreadPoolExecutor from io import BytesIO from json import dumps @@ -21,6 +24,8 @@ import requests from IPython.display import HTML, display from tqdm import tqdm +from sglang.srt.utils import kill_process_tree + logger = logging.getLogger(__name__) @@ -306,27 +311,12 @@ def download_and_cache_file(url: str, filename: Optional[str] = None): return filename -import fcntl - - def is_in_ci(): from sglang.test.test_utils import is_in_ci return is_in_ci() -LOCKFILE = os.path.expanduser("~/.sglang_port_lock") -PORT_REGISTRY = os.path.expanduser("~/.sglang_port_registry.json") - -if not os.path.exists(LOCKFILE): - with open(LOCKFILE, "w") as f: - pass - -if not os.path.exists(PORT_REGISTRY): - with open(PORT_REGISTRY, "w") as f: - json.dump([], f) - - def print_highlight(html_content: str): if is_in_ci(): html_content = str(html_content).replace("\n", "
") @@ -335,55 +325,44 @@ def print_highlight(html_content: str): print(html_content) -def init_port_registry(): - """Initialize the port registry file if it doesn't exist.""" - if not os.path.exists(PORT_REGISTRY): - with open(PORT_REGISTRY, "w") as f: - json.dump([], f) +process_socket_map = weakref.WeakKeyDictionary() -def reserve_port(start=30000, end=40000): +def reserve_port(host, start=30000, end=40000): """ - Reserve an available port using a file lock and a registry. - Returns the allocated port. + Reserve an available port by trying to bind a socket. + Returns a tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock. """ - init_port_registry() - with open(LOCKFILE, "w") as lock: - fcntl.flock(lock, fcntl.LOCK_EX) + candidates = list(range(start, end)) + random.shuffle(candidates) + + for port in candidates: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) try: - with open(PORT_REGISTRY, "r") as f: - used = json.load(f) - except Exception: - used = [] - for port in range(start, end): - if port not in used: - used.append(port) - with open(PORT_REGISTRY, "w") as f: - json.dump(used, f) - return port - raise RuntimeError("No free port available") + # Attempt to bind to the port on localhost + sock.bind((host, port)) + return port, sock + except socket.error: + sock.close() # Failed to bind, try next port + continue + raise RuntimeError("No free port available.") -def release_port(port): - """Release the reserved port by removing it from the registry.""" - with open(LOCKFILE, "w") as lock: - fcntl.flock(lock, fcntl.LOCK_EX) - try: - with open(PORT_REGISTRY, "r") as f: - used = json.load(f) - except Exception: - used = [] - if port in used: - used.remove(port) - with open(PORT_REGISTRY, "w") as f: - json.dump(used, f) +def release_port(lock_socket): + """ + Release the reserved port by closing the lock socket. 
+ """ + try: + lock_socket.close() + except Exception as e: + print(f"Error closing socket: {e}") def execute_shell_command(command: str) -> subprocess.Popen: """ Execute a shell command and return its process handle. """ - # Replace newline continuations and split the command string. command = command.replace("\\\n", " ").replace("\\", " ") parts = command.split() return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT) @@ -395,21 +374,28 @@ def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None): If no port is specified, a free port is reserved. """ if port is None: - port = reserve_port() + port, lock_socket = reserve_port(host) + else: + lock_socket = None + full_command = f"{command} --port {port}" process = execute_shell_command(full_command) + + if lock_socket is not None: + process_socket_map[process] = lock_socket + return process, port -def terminate_process(process, port=None): +def terminate_process(process): """ - Terminate the process and, if a port was reserved, release it. + Terminate the process and automatically release the reserved port. """ - from sglang.srt.utils import kill_process_tree - kill_process_tree(process.pid) - if port is not None: - release_port(port) + + lock_socket = process_socket_map.pop(process, None) + if lock_socket is not None: + release_port(lock_socket) def wait_for_server(base_url: str, timeout: int = None) -> None: