From 55de40f782d1949740aec74e88ae7cce00d59582 Mon Sep 17 00:00:00 2001 From: Shi Shuai <126407087+shuaills@users.noreply.github.com> Date: Wed, 19 Feb 2025 19:15:44 +0000 Subject: [PATCH] [Docs]: Fix Multi-User Port Allocation Conflicts (#3601) Co-authored-by: zhaochenyang20 Co-authored-by: simveit --- docs/README.md | 64 ++++++++++---- docs/backend/function_calling.ipynb | 2 +- docs/backend/native_api.ipynb | 10 +-- docs/backend/openai_api_completions.ipynb | 2 +- docs/backend/openai_api_embeddings.ipynb | 2 +- docs/backend/openai_api_vision.ipynb | 2 +- docs/backend/patch.py | 17 +++- docs/backend/send_request.ipynb | 9 +- docs/backend/structured_outputs.ipynb | 2 +- docs/index.rst | 49 ++++++++--- docs/start/install.md | 24 +++-- python/sglang/utils.py | 102 ++++++++++------------ 12 files changed, 168 insertions(+), 117 deletions(-) diff --git a/docs/README.md b/docs/README.md index 0a12d64b1..9cd59bda5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -36,42 +36,70 @@ find . -name '*.ipynb' -exec nbstripout {} \; # After these checks pass, push your changes and open a PR on your branch pre-commit run --all-files ``` +--- +### **Port Allocation and CI Efficiency** -If you need to run and shut down a SGLang server or engine, following these examples: - -1. 
Launch and close Sever: +**To launch and kill the server:** ```python -#Launch Sever +from sglang.test.test_utils import is_in_ci +from sglang.utils import wait_for_server, print_highlight, terminate_process -from sglang.utils import ( - execute_shell_command, - wait_for_server, - terminate_process, - print_highlight, +if is_in_ci(): + from patch import launch_server_cmd +else: + from sglang.utils import launch_server_cmd + +server_process, port = launch_server_cmd( + """ +python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ + --host 0.0.0.0 +""" ) -server_process = execute_shell_command( - "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0" -) - -wait_for_server("http://localhost:30000") - -# Terminate Sever +wait_for_server(f"http://localhost:{port}") +# Terminate Server terminate_process(server_process) ``` -2. Launch Engine and close Engine + +**To launch and kill the engine:** ```python # Launch Engine - import sglang as sgl import asyncio +from sglang.test.test_utils import is_in_ci + +if is_in_ci(): + import patch llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct") # Terminalte Engine llm.shutdown() ``` + +### **Why this approach?** + +- **Dynamic Port Allocation**: Avoids port conflicts by selecting an available port at runtime, enabling multiple server instances to run in parallel. +- **Optimized for CI**: The `patch` version of `launch_server_cmd` and `sgl.Engine()` in CI environments helps manage GPU memory dynamically, preventing conflicts and improving test parallelism. +- **Better Parallel Execution**: Ensures smooth concurrent tests by avoiding fixed port collisions and optimizing memory usage. + +### **Model Selection** + +For demonstrations in the docs, **prefer smaller models** to reduce memory consumption and speed up inference. Running larger models in CI can lead to instability due to memory constraints. 
+ +### **Prompt Alignment Example** + +When designing prompts, ensure they align with SGLang’s structured formatting. For example: + +```python +prompt = """You are an AI assistant. Answer concisely and accurately. + +User: What is the capital of France? +Assistant: The capital of France is Paris.""" +``` + +This keeps responses aligned with expected behavior and improves reliability across different files. diff --git a/docs/backend/function_calling.ipynb b/docs/backend/function_calling.ipynb index e80c91d54..a5c469623 100644 --- a/docs/backend/function_calling.ipynb +++ b/docs/backend/function_calling.ipynb @@ -405,7 +405,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)" + "terminate_process(server_process)" ] }, { diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 2208bab35..7f3a67c80 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -252,7 +252,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)\n", + "terminate_process(server_process)\n", "\n", "embedding_process, port = launch_server_cmd(\n", " \"\"\"\n", @@ -286,7 +286,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(embedding_process, port)" + "terminate_process(embedding_process)" ] }, { @@ -304,7 +304,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(embedding_process, port)\n", + "terminate_process(embedding_process)\n", "\n", "# Note that SGLang now treats embedding models and reward models as the same type of models.\n", "# This will be updated in the future.\n", @@ -355,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(reward_process, port)" + "terminate_process(reward_process)" ] }, { @@ -425,7 +425,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(tokenizer_free_server_process, port)" + "terminate_process(tokenizer_free_server_process)" ] } ], diff --git 
a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index e9c3f360a..96ee05022 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -512,7 +512,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)" + "terminate_process(server_process)" ] } ], diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 5a86ef18e..38543fa3b 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -169,7 +169,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(embedding_process, port)" + "terminate_process(embedding_process)" ] } ], diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 2ce921f50..2bb45e50f 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -243,7 +243,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(embedding_process, port)" + "terminate_process(embedding_process)" ] }, { diff --git a/docs/backend/patch.py b/docs/backend/patch.py index 1623c1b1f..d16422d08 100644 --- a/docs/backend/patch.py +++ b/docs/backend/patch.py @@ -1,4 +1,5 @@ import os +import weakref from sglang.utils import execute_shell_command, reserve_port @@ -21,15 +22,29 @@ def patched_post_init(self): server_args_mod.ServerArgs.__post_init__ = patched_post_init +process_socket_map = weakref.WeakKeyDictionary() + def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None): + """ + Launch the server using the given command. + If no port is specified, a free port is reserved. 
+ """ if port is None: - port = reserve_port() + port, lock_socket = reserve_port(host) + else: + lock_socket = None + extra_flags = ( f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} " f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} " f"--disable-cuda-graph" ) + full_command = f"{command} --port {port} {extra_flags}" process = execute_shell_command(full_command) + + if lock_socket is not None: + process_socket_map[process] = lock_socket + return process, port diff --git a/docs/backend/send_request.ipynb b/docs/backend/send_request.ipynb index ea6398da1..610538760 100644 --- a/docs/backend/send_request.ipynb +++ b/docs/backend/send_request.ipynb @@ -243,15 +243,8 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)" + "terminate_process(server_process)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/backend/structured_outputs.ipynb b/docs/backend/structured_outputs.ipynb index 31d59adf9..1fe1b0576 100644 --- a/docs/backend/structured_outputs.ipynb +++ b/docs/backend/structured_outputs.ipynb @@ -397,7 +397,7 @@ "metadata": {}, "outputs": [], "source": [ - "terminate_process(server_process, port)" + "terminate_process(server_process)" ] }, { diff --git a/docs/index.rst b/docs/index.rst index b58d99a8e..62c7383da 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ SGLang Documentation -==================================== +==================== SGLang is a fast serving framework for large language models and vision language models. It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language. 
@@ -10,7 +10,6 @@ The core features include: - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models. - **Active Community**: SGLang is open-source and backed by an active community with industry adoption. - .. toctree:: :maxdepth: 1 :caption: Getting Started @@ -39,7 +38,6 @@ The core features include: frontend/frontend.md frontend/choices_methods.md - .. toctree:: :maxdepth: 1 :caption: SGLang Router @@ -47,24 +45,47 @@ The core features include: router/router.md +References +========== + +General +--------------------- .. toctree:: :maxdepth: 1 - :caption: References references/supported_models.md + references/contribution_guide.md + references/troubleshooting.md + references/faq.md + references/learn_more.md + +Hardware +-------------------------- +.. toctree:: + :maxdepth: 1 + + references/AMD.md + references/amd_configure.md + references/nvidia_jetson.md + +Advanced Models & Deployment +------------------------------ +.. toctree:: + :maxdepth: 1 + + references/deepseek.md + references/multi_node.md + references/multi_node_inference_k8s_lws.md + references/modelscope.md + +Performance & Tuning +-------------------- +.. 
toctree:: + :maxdepth: 1 + references/sampling_params.md references/hyperparameter_tuning.md references/benchmark_and_profiling.md references/accuracy_evaluation.md references/custom_chat_template.md - references/amd_configure.md - references/deepseek.md - references/multi_node.md - references/multi_node_inference_k8s_lws.md - references/modelscope.md references/quantization.md - references/contribution_guide.md - references/troubleshooting.md - references/nvidia_jetson.md - references/faq.md - references/learn_more.md diff --git a/docs/start/install.md b/docs/start/install.md index db29241c8..4fadf597a 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -2,19 +2,27 @@ You can install SGLang using any of the methods below. For running DeepSeek V3/R1 with SGLang, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is always recommended to use the [latest release version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid fixed issues and environment-related problems. -## Method 1: With pip -``` +## Method 1: With pip or uv + +We recommend using uv to install the dependencies with a higher installation speed: + +```bash pip install --upgrade pip -pip install sgl-kernel --force-reinstall --no-deps -pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python +pip install uv +uv pip install sgl-kernel --force-reinstall --no-deps +uv pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python ``` -Note: SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). 
Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`. +**Quick Fix to Installation** -If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions: +- SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`. -- Use `export CUDA_HOME=/usr/local/cuda-` to set the `CUDA_HOME` environment variable. -- Follow the procedure described in [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above. +- If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions: + +1. Use `export CUDA_HOME=/usr/local/cuda-` to set the `CUDA_HOME` environment variable. +2. Follow the procedure described in [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above. + +- If you encounter `ImportError: cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, try to use the specified version of `transformers` in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, just run `pip install transformers==4.48.3`. 
## Method 2: From source ``` diff --git a/python/sglang/utils.py b/python/sglang/utils.py index d83022303..4a751aa88 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -5,12 +5,15 @@ import importlib import json import logging import os +import random import signal +import socket import subprocess import sys import time import traceback import urllib.request +import weakref from concurrent.futures import ThreadPoolExecutor from io import BytesIO from json import dumps @@ -21,6 +24,8 @@ import requests from IPython.display import HTML, display from tqdm import tqdm +from sglang.srt.utils import kill_process_tree + logger = logging.getLogger(__name__) @@ -306,27 +311,12 @@ def download_and_cache_file(url: str, filename: Optional[str] = None): return filename -import fcntl - - def is_in_ci(): from sglang.test.test_utils import is_in_ci return is_in_ci() -LOCKFILE = os.path.expanduser("~/.sglang_port_lock") -PORT_REGISTRY = os.path.expanduser("~/.sglang_port_registry.json") - -if not os.path.exists(LOCKFILE): - with open(LOCKFILE, "w") as f: - pass - -if not os.path.exists(PORT_REGISTRY): - with open(PORT_REGISTRY, "w") as f: - json.dump([], f) - - def print_highlight(html_content: str): if is_in_ci(): html_content = str(html_content).replace("\n", "
") @@ -335,55 +325,44 @@ def print_highlight(html_content: str): print(html_content) -def init_port_registry(): - """Initialize the port registry file if it doesn't exist.""" - if not os.path.exists(PORT_REGISTRY): - with open(PORT_REGISTRY, "w") as f: - json.dump([], f) +process_socket_map = weakref.WeakKeyDictionary() -def reserve_port(start=30000, end=40000): +def reserve_port(host, start=30000, end=40000): """ - Reserve an available port using a file lock and a registry. - Returns the allocated port. + Reserve an available port by trying to bind a socket. + Returns a tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock. """ - init_port_registry() - with open(LOCKFILE, "w") as lock: - fcntl.flock(lock, fcntl.LOCK_EX) + candidates = list(range(start, end)) + random.shuffle(candidates) + + for port in candidates: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) try: - with open(PORT_REGISTRY, "r") as f: - used = json.load(f) - except Exception: - used = [] - for port in range(start, end): - if port not in used: - used.append(port) - with open(PORT_REGISTRY, "w") as f: - json.dump(used, f) - return port - raise RuntimeError("No free port available") + # Attempt to bind to the port on localhost + sock.bind((host, port)) + return port, sock + except socket.error: + sock.close() # Failed to bind, try next port + continue + raise RuntimeError("No free port available.") -def release_port(port): - """Release the reserved port by removing it from the registry.""" - with open(LOCKFILE, "w") as lock: - fcntl.flock(lock, fcntl.LOCK_EX) - try: - with open(PORT_REGISTRY, "r") as f: - used = json.load(f) - except Exception: - used = [] - if port in used: - used.remove(port) - with open(PORT_REGISTRY, "w") as f: - json.dump(used, f) +def release_port(lock_socket): + """ + Release the reserved port by closing the lock socket. 
+ """ + try: + lock_socket.close() + except Exception as e: + print(f"Error closing socket: {e}") def execute_shell_command(command: str) -> subprocess.Popen: """ Execute a shell command and return its process handle. """ - # Replace newline continuations and split the command string. command = command.replace("\\\n", " ").replace("\\", " ") parts = command.split() return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT) @@ -395,21 +374,28 @@ def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None): If no port is specified, a free port is reserved. """ if port is None: - port = reserve_port() + port, lock_socket = reserve_port(host) + else: + lock_socket = None + full_command = f"{command} --port {port}" process = execute_shell_command(full_command) + + if lock_socket is not None: + process_socket_map[process] = lock_socket + return process, port -def terminate_process(process, port=None): +def terminate_process(process): """ - Terminate the process and, if a port was reserved, release it. + Terminate the process and automatically release the reserved port. """ - from sglang.srt.utils import kill_process_tree - kill_process_tree(process.pid) - if port is not None: - release_port(port) + + lock_socket = process_socket_map.pop(process, None) + if lock_socket is not None: + release_port(lock_socket) def wait_for_server(base_url: str, timeout: int = None) -> None: